1 Import original ASV tables - quality control

Set up working R environment and import 18S ASV table. Modify input tables and import as phyloseq objects in order to perform quality control removal of contaminant ASVs (decontam)

library(tidyverse);library(reshape2);library(cowplot);library(patchwork);library(phyloseq);library(decontam)
load("data-input/GR-ASVtables-updatedTax.RData", verbose = TRUE)
## Loading objects:
##   GR_tagseq_longformat
##   GR_tagseq_wideformat

1.1 Clean ASV table with ‘decontam’

Import ASV table as phyloseq object, note control samples.

taxmat <- GR_tagseq_wideformat %>% 
  select(Feature.ID, Taxon_updated) %>% 
  separate(Taxon_updated, c("Kingdom","Supergroup","Division","Class","Order","Family","Genus","Species"), sep = ";", remove = FALSE) %>% 
  column_to_rownames(var = "Feature.ID") %>% 
  as.matrix
## Warning: Expected 8 pieces. Missing pieces filled with `NA` in 4828 rows [3, 5,
## 9, 11, 14, 20, 23, 28, 33, 34, 35, 37, 39, 40, 44, 46, 47, 53, 55, 57, ...].
# class(taxmat)
# head(taxmat)
asvmat <- GR_tagseq_wideformat %>% 
  select(Feature.ID, starts_with(c("Gorda", "Axial"))) %>% 
  column_to_rownames(var = "Feature.ID") %>% 
  as.matrix
row.names(taxmat)<-row.names(asvmat)
# class(asvmat);class(taxmat)

ASV = otu_table(asvmat, taxa_are_rows = TRUE) #phyloseq command
TAX = tax_table(taxmat)
physeq <- phyloseq(ASV, TAX)
physeq #Phyloseq object
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 9175 taxa and 34 samples ]
## tax_table()   Taxonomy Table:    [ 9175 taxa by 9 taxonomic ranks ]
# Include additional sample names
samplenames <- as.data.frame(colnames(asvmat))
# samplenames; head(asvtab)
colnames(samplenames)[1]<-"SAMPLE"

# Import metadata
ventnames <- read.delim("data-input/ventnames-gordaridge.txt")
# names(ventnames);head(ventnames)
# View(ventnames)
colnames(ventnames)[1]<-"SAMPLE"
#
samplenames_1 <- left_join(samplenames, ventnames)
## Joining, by = "SAMPLE"
row.names(samplenames_1)<-sample_names(physeq) 
samplenames_1 <- samplenames_1 %>% unite(LocationName_Sampletype, LocationName, Sampletype, sep = " ", remove = FALSE)

# Convert to phyloseq object
sampledata <- sample_data(samplenames_1)
# Merge with other data
physeq_names = merge_phyloseq(physeq, sampledata)
# physeq_names
# sample_data(physeq_names)

1.2 Decontam step

# Decontam:
physeq_names
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 9175 taxa and 34 samples ]
## sample_data() Sample Data:       [ 34 samples by 9 sample variables ]
## tax_table()   Taxonomy Table:    [ 9175 taxa by 9 taxonomic ranks ]
# Check out library size of my data
df <- as.data.frame(sample_data(physeq_names))
df$LibrarySize <- sample_sums(physeq_names)
df <- df[order(df$LibrarySize),]
df$Index <- seq(nrow(df))
#
ggplot(data=df, aes(x=Index, y=LibrarySize, fill=Sample_or_Control, shape=LOCATION)) + 
  geom_point(color="black", size=3, aes(shape=LOCATION)) +
  scale_shape_manual(values = c(21,22,23)) +
  theme_bw()

> Shows that out of the 3 ship blanks I have, one of the sames has a pretty large library size, otherwise, control samples have very small library sizes.

1.3 Identify contaminant ASVs

# Assign negative control designation
sample_data(physeq_names)$is.neg <- sample_data(physeq_names)$Sample_or_Control == "Control Sample"

# ID contaminants using Prevalence information
contamdf.prev <- isContaminant(physeq_names, method="prevalence", neg="is.neg", threshold = 0.5, normalize = TRUE) 
table(contamdf.prev$contaminant) # Report number of ASVs IDed as contamintants
## 
## FALSE  TRUE 
##  9141    34

0.5 - this threshold will ID contaminants in all samples that are more prevalent in negative controls than in positive samples. In this study, control samples included 1 lab-based blank and 3 ship-board blanks taken at the time of field study. Results showed 34 ASVs to be considered “contaminants”

# Make phyloseq object of presence-absence in negative controls and true samples
## change to presence absence
gr.pa <- transform_sample_counts(physeq_names, function(abund) 1*(abund>0))

# isolate PA of positive and negative samples
gr.pa.neg <- prune_samples(sample_data(gr.pa)$Sample_or_Control == "Control Sample", gr.pa)
gr.pa.pos <- prune_samples(sample_data(gr.pa)$Sample_or_Control == "True Sample", gr.pa)

1.4 Remove contaminant ASVs from data

# Subset TRUE contaminants
contams <- subset(contamdf.prev, contaminant == "TRUE")
contams$Feature.ID <- row.names(contams)
# head(contams);dim(contams)
list_of_contams <- as.character(contams$Feature.ID)
#
# Explore taxa IDed as contaminants
taxa_list <- as.data.frame(taxmat)
taxa_list$Feature.ID <- row.names(taxa_list)

taxa_contams <- left_join(contams, taxa_list)
# write_delim(taxa_contams, path = "List-of-contaminant-ASVs.txt", delim = "\t")

# Plot total sequences and which are contaminants
# Remove contaminant and count sequence sums per sample to see which samples had the highest number of contamiant sequences removed.
# After remove contaminants, what % of sequences is removed?
# head(GR_tagseq_counts[1:2,])
GR_tagseq_longformat$CONTAM <- "Pass"
# head(contams[1:2,])
# str(list_of_contams)
GR_tagseq_longformat$CONTAM[GR_tagseq_longformat$Feature.ID %in% list_of_contams] = "Fail"
# head(GR_tagseq_counts[1:2,])

# Make character list of all feature.ids to KEEP:
keep1<- subset(GR_tagseq_longformat, CONTAM %in% "Pass")
# length(unique(keep1$Feature.ID))
keep_asvs <- as.character(unique(keep1$Feature.ID)) #see below
#
passfail <- GR_tagseq_longformat %>%
  group_by(SAMPLE, CONTAM) %>%
  summarise(SUM_CONTAM = sum(COUNT)) %>%
  data.frame

1.5 Save output

passfail_wID <- left_join(passfail, ventnames, by = "SAMPLE")
# Original number of reads
sum(GR_tagseq_longformat$COUNT)
## [1] 1569829
# Original number of ASVs
length(unique(GR_tagseq_longformat$Feature.ID))
## [1] 9175
# unique(GR_tagseq_counts$SAMPLEID)
GR_tagseq_counts_noCTRL <- subset(GR_tagseq_longformat, !(SAMPLEID %in% "CTRL"))
# New total number of sequences
sum(GR_tagseq_counts_noCTRL$COUNT)
## [1] 1479273
counts_decont <- subset(GR_tagseq_longformat, !(Feature.ID %in% list_of_contams))
length(unique(counts_decont$Feature.ID)) - length(unique(GR_tagseq_longformat$Feature.ID)) # Confirm 34 lines removed
## [1] -34

2 Percent of sequence removed after decontam step

# % of sequences was removed following decontam; this is counting the ship blank samples themselves
100*(1-(sum(counts_decont$COUNT)/sum(GR_tagseq_counts_noCTRL$COUNT)))
## [1] 1.23581
# head(counts_decont)
# Breakdown by samples:
passfail_wide <- dcast(passfail, SAMPLE ~ CONTAM)
## Using SUM_CONTAM as value column: use value.var to override.
passfail_wide$PercLossSeq <- paste(100*(passfail_wide$Fail/(passfail_wide$Fail+passfail_wide$Pass)))
# dim(passfail_wide)
# write.csv(passfail_wide, file="PercSeqLost-decontam.csv")
# breakdown by sample - reports % lost per sample

# Remove contaminant sequences from phyloseq object:
# Subset TRUE contaminants
# ?prune_taxa
# class(keep_asvs)
physeq_tmp <- prune_taxa(keep_asvs, physeq_names)
# sample_data(physeq_tmp)

# Remove one sample with too few sequences
physeq_clean <- subset_samples(physeq_tmp, sample_names(physeq_tmp) !="GordaRidge_BSW020_sterivex_2019_REPa")
# sample_data(physeq_clean)
# physeq_clean
# Remove control samples from data frame
tmp <- subset(GR_tagseq_longformat, !(SAMPLEID %in% "CTRL")) # Remove controls, get list of sample names that are controls
samples_keep <- as.character(unique(tmp$SAMPLE))
physeq_clean_true <- prune_samples(samples_keep, physeq_clean)

# Save as R Data
save(counts_decont, file="data-input/GR-ASV-table-clean-19-08-2020.RData")

3 Plot high level taxonomy of protists in situ

Import cleaned ASV data, curate taxonomic assignments specific to protists.

load("data-input/GR-ASV-table-clean-19-08-2020.RData", verbose=TRUE) # after decontam clenaing
## Loading objects:
##   counts_decont
gr_counts <- counts_decont %>% 
  filter(COUNT > 0) %>% 
  separate(Taxon_updated, c("Kingdom","Supergroup","Division","Class","Order","Family","Genus","Species"), sep = ";", remove = FALSE) %>% 
  data.frame
# head(gr_counts)
tax_only_tmp <- gr_counts %>% 
  select(Taxon_updated, Kingdom,Supergroup,Division,Class,Order,Family,Genus,Species) %>% 
  distinct() %>% 
  data.frame
ventnames <- read.delim("data-input/ventnames-gordaridge.txt")
colnames(ventnames)[1]<-"SAMPLE"
# Join with dataframe
gr_counts_name <- gr_counts %>% 
  left_join(select(ventnames, SAMPLE, LOCATION_SPECIFIC, Sampletype, LocationName)) %>% 
  data.frame
## Joining, by = "SAMPLE"
#ventnames[c(1,3,5:6)]
gr_counts_name$LocationName[gr_counts_name$LOCATION == "Shipblank"]="Shipblank"

3.1 Taxonomy curation - PR2

pr2_curate <- function(df){
  # Add a column
  df$Taxa <-"Unassigned-Eukaryote"
  df$Taxa[df$Supergroup == "Alveolata"]="Alveolata-Other"
  df$Taxa[df$Division == "Ciliophora"]="Alveolata-Ciliates"
  df$Taxa[df$Division == "Dinoflagellata"]="Alveolata-Dinoflagellates"
  df$Taxa[df$Class == "Syndiniales"] = "Alveolata-Syndiniales"
  df$Taxa[df$Class == "Apicomplexa"]="Alveolata-Apicomplexa"
  df$Taxa[df$Supergroup == "Hacrobia"]="Hacrobia-Other"
  df$Taxa[df$Division == "Cryptophyta"]="Hacrobia-Cryptophyta"
  df$Taxa[df$Division == "Haptophyta"]="Hacrobia-Haptophyta"
  df$Taxa[df$Supergroup == "Opisthokonta"]="Opisthokonta-Other"
  df$Taxa[df$Division == "Fungi"]="Opisthokonta-Fungi"
  df$Taxa[df$Division == "Metazoa"]="Opisthokonta-Metazoa"
  df$Taxa[df$Supergroup == "Stramenopiles"]="Stramenopiles-Other"
  df$Taxa[df$Division == "Ochrophyta"]="Stramenopiles-Ochrophyta"
  mast <- unique(filter(df, grepl("MAST", Class)) %>% select(Class))
  mast_list <- as.character(mast$Class)
  df$Taxa[df$Class %in% mast_list]="Stramenopiles-MAST"
  df$Taxa[df$Supergroup == "Archaeplastida"]="Archaeplastida-Other"
  df$Taxa[df$Division == "Chlorophyta"]="Archaeplastida-Chlorophyta"
  df$Taxa[df$Supergroup == "Excavata"]="Excavata"
  # df$Taxa[df$Division == "Metamonada"]="Excavata-Metamonada"
  df$Taxa[df$Supergroup == "Amoebozoa"]="Amoebozoa"
  # df$Taxa[df$Division == "Lobosa"]="Amoebozoa-Lobosa"
  df$Taxa[df$Supergroup == "Rhizaria"]="Rhizaria-Other"
  df$Taxa[df$Division == "Cercozoa"]="Rhizaria-Cercozoa"
  df$Taxa[df$Division == "Radiolaria"]="Rhizaria-Radiolaria"
  return(df)
}
gr_counts_wtax <- pr2_curate(gr_counts_name)
# head(gr_counts_wtax[1:3,])
# unique(gr_counts_wtax$Taxa)

Output is the full ASV table with added columns for curated taxonomy. Above also provides a list of the unique taxonomic names assigned.

gr_counts_wtax_samplesonly <- subset(gr_counts_wtax, !(Sampletype == "control"))

## To average across replicates, modify SUPR sample names
gr_counts_filter <- gr_counts_wtax_samplesonly
gr_counts_filter$SAMPLEID<- sub("SUPRS9", "SUPR", gr_counts_filter$SAMPLEID)
gr_counts_filter$SAMPLEID<- sub("SUPRS11", "SUPR", gr_counts_filter$SAMPLEID)
gr_counts_filter$SAMPLEID<- sub("SUPRS10", "SUPR", gr_counts_filter$SAMPLEID)
gr_counts_filter$SAMPLEID<- sub("SUPRS2", "SUPR", gr_counts_filter$SAMPLEID)
gr_counts_filter$SAMPLEID<- sub("SUPRS3", "SUPR", gr_counts_filter$SAMPLEID)
gr_counts_filter$SAMPLEID<- sub("SUPRS1", "SUPR", gr_counts_filter$SAMPLEID)
unique(gr_counts_filter$Taxa)
##  [1] "Alveolata-Ciliates"         "Unassigned-Eukaryote"      
##  [3] "Opisthokonta-Metazoa"       "Archaeplastida-Other"      
##  [5] "Opisthokonta-Fungi"         "Stramenopiles-MAST"        
##  [7] "Stramenopiles-Ochrophyta"   "Alveolata-Dinoflagellates" 
##  [9] "Alveolata-Syndiniales"      "Rhizaria-Radiolaria"       
## [11] "Hacrobia-Haptophyta"        "Opisthokonta-Other"        
## [13] "Archaeplastida-Chlorophyta" "Rhizaria-Cercozoa"         
## [15] "Amoebozoa"                  "Hacrobia-Other"            
## [17] "Stramenopiles-Other"        "Hacrobia-Cryptophyta"      
## [19] "Alveolata-Other"            "Rhizaria-Other"            
## [21] "Excavata"

3.1.1 Get stats on taxonomy assignments

# Sum of all sequences
a <- sum(gr_counts_filter %>% filter(!(SAMPLEID == "CTRL")) %>% select(COUNT)); a
## [1] 1434482
# Total ASVs
dim(unique(gr_counts_filter %>% filter(!(SAMPLEID == "CTRL")) %>% select(Feature.ID)))[1]
## [1] 9028
# Percentage of all sequences Unassigned Eukaryote
x <- sum(gr_counts_filter %>% filter(!(SAMPLEID == "CTRL")) %>% filter(Taxon_updated == "Eukaryota") %>% select(COUNT))
100*(x/a)
## [1] 2.823876
# Total ASVs left "Unassigned-Eukaryote"
dim(unique(gr_counts_filter %>% filter(!(SAMPLEID == "CTRL")) %>% filter(Taxon_updated == "Eukaryota") %>% select(Feature.ID)))[1]
## [1] 1058
# Percentage of all sequences assigned Opisthokonts
x <- sum(gr_counts_filter %>% filter(!(SAMPLEID == "CTRL")) %>% filter(Supergroup == "Opisthokonta") %>% select(COUNT))
100*(x/a)
## [1] 12.92606
dim(unique(gr_counts_filter %>% filter(!(SAMPLEID == "CTRL")) %>% filter(Supergroup == "Opisthokonta") %>% select(Feature.ID)))[1]
## [1] 615

3.2 Prepare dataframe to for barplot

3.2.1 Average ASV sequence count across replicate samples

Average ASV sequence counts across replicate samples, COUNT_AVG column will now equal the ASV sequence count value across replicates

gr_counts_avg_wtax <- gr_counts_filter %>%
  group_by(Feature.ID, SAMPLEID, Sampletype,  LOCATION_SPECIFIC, LocationName, Taxon_updated, Kingdom, Supergroup, Division, Class, Order, Family, Genus, Species, Taxa) %>%
  summarise(COUNT_AVG = mean(COUNT)) %>%
  as.data.frame
# dim(gr_counts_filter);dim(gr_counts_avg_wtax)

# Save output and view
# save(gr_counts_filter,gr_counts_wtax, gr_counts_avg_wtax, file="data-input/GordaRidge-ASVtable-avg-19-08-2020.RData")
# tmp <- gr_counts_avg_wtax %>% select(Taxa, Taxon_updated, Kingdom, Supergroup, Division, Class, Order, Family, Genus, Species) %>% distinct() %>% data.frame
# write_delim(tmp, path = "tax-tmp-2.txt", delim = "\t")

3.2.2 Sum ASV sequence counts to taxonomic level

Now sum ASV counts by curated taxonomic level

gr_counts_avg_TAXA <- gr_counts_avg_wtax %>%
    # sum by like taxa
    group_by(SAMPLEID, Sampletype, LocationName, Taxa) %>% 
    summarise(SUM = sum(COUNT_AVG)) %>% 
    unite(SAMPLE, LocationName, Sampletype, SAMPLEID, sep = " ", remove = FALSE) %>% 
    data.frame
# View(gr_counts_avg_TAXA)
# head(gr_counts_avg_TAXA)
supp_table <- gr_counts_avg_TAXA %>% 
  pivot_wider(names_from = Taxa, values_from = SUM, values_fill = 0)
write_delim(supp_table, path = "Suppl-18S-relabun.txt", delim = "\t")

3.2.3 Plot factoring and parameters

# unique(gr_counts_avg_TAXA$Taxa)
level2ORDER <- c("Alveolata-Ciliates","Alveolata-Dinoflagellates","Alveolata-Syndiniales","Alveolata-Other","Rhizaria-Cercozoa","Rhizaria-Radiolaria","Rhizaria-Other","Stramenopiles-MAST","Stramenopiles-Ochrophyta","Stramenopiles-Other","Hacrobia-Cryptophyta","Hacrobia-Haptophyta","Hacrobia-Other","Amoebozoa","Excavata","Archaeplastida-Chlorophyta","Archaeplastida-Other","Opisthokonta-Fungi","Opisthokonta-Metazoa","Opisthokonta-Other","Unassigned-Eukaryote")

level2color <- c("#f1eef6","#d7b5d8","#df65b0","#ce1256","#fc9272","#ef3b2c","#800026","#fff7bc","#fec44f","#d95f0e","#74c476","#238b45","#00441b","#7fcdbb","#084081","#2b8cbe","#016c59","#bcbddc","#807dba","#54278f","#bdbdbd")
gr_counts_avg_TAXA$LEVEL2ORDER <- factor(gr_counts_avg_TAXA$Taxa, levels=level2ORDER)
names(level2color)<-level2ORDER

sample_order_all<-c("Shallow seawater in situ sterivex","Deep seawater in situ sterivex","Mt Edwards Plume in situ sterivex","Mt Edwards Plume Grazing T0","Mt Edwards Plume Grazing T24","Mt Edwards Plume Grazing T36","Mt Edwards Vent in situ SUPR","Mt Edwards Vent Grazing T0","Mt Edwards Vent Grazing T36","Venti Latte Vent in situ SUPR","Venti Latte Vent Grazing T0","Venti Latte Vent Grazing T36","Candelabra Vent in situ SUPR","Candelabra Vent Grazing T24","SirVentsAlot Vent in situ SUPR","SirVentsAlot Vent Grazing T24")
sample_name_all<-c("Shallow seawater","Deep seawater","Mt Edwards Plume","Mt Edwards Plume T0","Mt Edwards Plume T23","Mt Edwards Plume T35","Mt Edwards Vent","Mt Edwards Vent T0","Mt Edwards Vent  T36","Venti Latte Vent","Venti Latte Vent T0","Venti Latte Vent T29","Candelabra Vent","Candelabra Vent T26","SirVentsAlot Vent","SirVentsAlot Vent T24")
gr_counts_avg_TAXA$SAMPLE_ORDER <- factor(gr_counts_avg_TAXA$SAMPLE, levels = sample_order_all, labels = sample_name_all)

exporder <- c("sterivex", "SUPR", "T0", "T24", "T36")
gr_counts_avg_TAXA$SAMPLEID_ORDER <- factor(gr_counts_avg_TAXA$SAMPLEID, levels = exporder)
gr_counts_avg_TAXA$LOCATION_ORDER <- factor(gr_counts_avg_TAXA$LocationName, levels=c("Shallow seawater", "Deep seawater", "Mt Edwards Plume", "Mt Edwards Vent", "Venti Latte Vent", "Candelabra Vent", "SirVentsAlot Vent"))
# head(gr_counts_avg_TAXA)

3.3 Protistan community barplots

barplot_lev2 <- function(df){
  ggplot(df, aes(x=SAMPLE_ORDER, y=SUM, fill=LEVEL2ORDER)) +
    geom_bar(stat="identity", position="fill", color="black") +
    scale_fill_manual(values=level2color) +
    scale_y_continuous(expand = c(0,0))+
    theme(legend.position = "right",
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank(),
          panel.background = element_blank(),
          axis.text.x = element_text(angle=90, hjust=1, vjust=0.5, color = "black", face="bold", size = 12),
          axis.text.y = element_text(color="black", face="bold", size = 12),
          strip.text = element_blank(),
          legend.title = element_blank()) +
    labs(x="", y="Relative abundance")+
    facet_grid(.~LOCATION_ORDER, space = "free", scales = "free")
}
insitu <- c("sterivex", "SUPR")
rm <- c("Unassigned", "Opisthokonta-Other", "Opisthokonta-Fungi", "Opisthokonta-Metazoa")

# svg("GR-insitu-tax.svg", w = 8, h = 8)
barplot_lev2(filter(gr_counts_avg_TAXA, (SAMPLEID %in% insitu & !(Taxa %in% rm))))

# dev.off()
# options(repr.plot.width = 14, repr.plot.height = 8)

insitu <- c("sterivex", "SUPR")
rm <- c("Unassigned", "Opisthokonta-Other", "Opisthokonta-Fungi", "Opisthokonta-Metazoa")

# svg("GR-all-tax-supplementary.svg", w = 20, h = 8)
plot_grid(
    barplot_lev2(filter(gr_counts_avg_TAXA, !(Taxa %in% rm))) + theme(legend.position = "none"),
        barplot_lev2(filter(gr_counts_avg_TAXA)),
        ncol = 2, rel_widths = c(0.70,1), labels = c("a", "b"))

# dev.off()

4 Import 16S reads

# Import metadata for 16S
ventnames_16 <- read.delim("data-input/ventnames-gordaridge-16S.txt")
ventnames_16_mod <- ventnames_16 %>% 
    mutate(location = case_when(
        grepl("Plume", SAMPLE_AMY) ~ "Plume",
        grepl("BSW", SAMPLE_AMY) ~ "BSW", 
        grepl("Vent", LocationName) ~ "Vent"),
          NA_NUM = SAMPLEID_16S) %>% 
    mutate(NA_NUM = str_replace(NA_NUM, "NA108", "")) %>%
    mutate(NA_NUM = str_replace(NA_NUM, "NA080", "")) %>% 
    mutate(NA_NUM = str_replace(NA_NUM, "aSTEP", "")) %>% 
    mutate(NA_NUM = str_replace(NA_NUM, "bSTEP", "")) %>% 
    mutate(NA_NUM = str_replace(NA_NUM, "STEP20200226", "")) %>% 
    mutate(NA_NUM = str_replace(NA_NUM, "STEP", "")) %>% 
    unite(NEW_SAMPLEID, location, NA_NUM, sep ="") %>% 
    data.frame
countbac <- read.delim("data-input/CountTable-wtax-16s-plus3-2020-06-23.txt")

# Remove samples that were repeated
rm <- c("NA108003STEP", "NA108039STEP", "NA108087STEP")

# Include only samples that also appear int he 18S dataset
x <- as.character(unique(ventnames$LocationName))
tmp <- countbac %>%
  select(-all_of(rm)) %>% 
  pivot_longer(starts_with("NA"), names_to = "SAMPLEID_16S") %>% 
  left_join(ventnames_16_mod) %>% 
  filter(LocationName %in% x) %>%
  data.frame
## Joining, by = "SAMPLEID_16S"
sum(tmp$value)
## [1] 1090141
length(unique(tmp$Feature.ID))
## [1] 6532

4.1 Format 16S ASV table for plotting

bac_df_plot <- countbac %>%
  separate(Taxon, sep = ";D_[[:digit:]]__", into = c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"), remove = TRUE, extra = "merge") %>% # Warnings are OK with NAs
  mutate_if(is.character, str_replace_all, pattern = "D_0__", replacement = "") %>%
  filter(Domain %in% "Archaea" | Domain %in% "Bacteria") %>% #Select only archaea and bacteria, removing unassigned
  select(-all_of(rm)) %>% # Remove samples we are replacing
  pivot_longer(starts_with("NA"), names_to = "SAMPLEID_16S") %>% 
  left_join(ventnames_16_mod) %>% 
  filter(LocationName %in% x) %>% 
  data.frame
# head(bac_df_plot)
sum(bac_df_plot$value)
## [1] 1089209
length(unique(bac_df_plot$Feature.ID))
## [1] 6497

4.1.1 Subset 16S ASVs by abundance

bac_df_filtered <- bac_df_plot %>% 
  ungroup() %>% 
  mutate(TOTALSEQ = sum(value)) %>% 
  group_by(Feature.ID) %>% 
    summarise(SUM = sum(value),
             RELABUN = SUM/TOTALSEQ) %>%
  filter(RELABUN >= 0.001) %>% 
  add_column(KEEP = "YES") %>% 
  right_join(bac_df_plot) %>% 
  filter(KEEP == "YES") %>% 
  data.frame
## `summarise()` regrouping output by 'Feature.ID' (override with `.groups` argument)
## Joining, by = "Feature.ID"

4.1.2 Curate 16S taxonomic assignment

# colnames(bac_df_plot)
# head(bac_df_plot)

# ASVs that are within the top 0.1% abundance
top16s_asvs <- as.character(unique(bac_df_filtered$Feature.ID))

# Subset a dataframe with only ASVs
asv_curated <- bac_df_plot %>% 
  select(Feature.ID, Domain, Phylum, Class, Order, Family, Genus, Species) %>% 
  distinct() %>% data.frame
# head(asv_curated)
# head(bac_df_plot)

Here we are curating the 16S taxonomy assignments so we can get an informative look at the in situ bacteria population diversity and biogeography.

# Add a column for updated taxonomy name
curate_16s_tax <- function(df, THRESHOLD){
  # List the class and genus level designations that should be named at class level
  class <- c("Alphaproteobacteria", "Deltaproteobacteria", "Gammaproteobacteria", "Nitrososphaeria", "Thermoplasmata")
  genus <- c("Arcobacter","Campylobacter","Hydrogenimonas","Nitratiruptor","Nitratifractor","Sulfurovum","Sulfurimonas","Caminibacter", "Psychrilyobacter", "SUP05 cluster")
  # List the appropriate taxonomic names for this whole level to be placed into "Other" category
  class_other <- c("Verrucomicrobiae")
  phylum_other <- c("Planctomycetes", "Poribacteria", "Cyanobacteria", "WPS-2")
  order_other <- c("Synechococcales")
  totalsumseq <- sum(df$value) # total number of sequences
  tmp_filter <- df %>% 
    group_by(Feature.ID) %>% 
    summarise(SUM = sum(value)) %>% 
    mutate(RELABUN = 100*(SUM/totalsumseq)) %>% 
    filter(RELABUN >= THRESHOLD) %>% #User-defined relabun threshold
    data.frame
  keep_asvs_relabun <- as.character(unique(tmp_filter$Feature.ID))
  df2 <- df %>%
    mutate(Tax_update = Phylum) %>% # Default to filling new taxa level to phylum
    mutate(Tax_update = ifelse(Feature.ID %in% keep_asvs_relabun, Tax_update, "Other"), # Change name to other if it falls below relative abundance Threshold
           Tax_update = ifelse(Class %in% class, paste(Phylum, Class, sep = "-"), Tax_update),
           Tax_update = ifelse(Order == "Methylococcales", paste(Phylum, "Methylococcales", sep = "-"), Tax_update),
           Tax_update = ifelse(Order == "Oceanospirillales", paste(Phylum, "Oceanospirillales", sep = "-"), Tax_update),
           Tax_update = ifelse(Order == "Thioglobaceae", paste(Phylum, "Thioglobaceae", sep = "-"), Tax_update),
           Tax_update = ifelse(Family == "Nitrospinaceae", paste(Phylum, "Nitrospinaceae", sep = "-"), Tax_update),
           Tax_update = ifelse(Class %in% class_other, "Other", Tax_update),
           Tax_update = ifelse(Phylum %in% phylum_other, "Other", Tax_update),
           Tax_update = ifelse(Order %in% order_other, "Other", Tax_update),
           Tax_update = ifelse(Genus %in% genus, paste(Phylum, Genus, sep = "-"), Tax_update))
   return(df2)
}
# head(bac_df_plot) # Add updated taxa list to this dataframe
# unique(bac_df_plot$LocationName)
bac_wcuratedtax <- curate_16s_tax(bac_df_plot %>% 
  filter(!(Genus == "Ralstonia")), 0.1) #Will place ASVs <0.1% abundance into "Other category"
## `summarise()` ungrouping output (override with `.groups` argument)
# View(asv_curated_updated)
# unique(bac_wcuratedtax$Tax_update)
# length(unique(bac_wcuratedtax$Tax_update))
tax_16s <- bac_wcuratedtax %>% 
  select(Tax_update, Domain:Species) %>% 
  distinct()
# write_delim(tax_16s, path = "tax-key-16s-21-08-2020.txt", delim = "\t")

4.1.3 Average across replicates & sub to taxa

# update exisiting taxonomy
bac_wcuratedtax_toplot <- bac_wcuratedtax %>% 
  # left_join(asv_curated_updated) %>% 
  # filter(!(Genus == "Ralstonia")) %>% 
  # Average ASV seq count across replicates
  group_by(Feature.ID, LocationName, Tax_update) %>% 
  summarise(AVG_count = mean(value)) %>% 
  ungroup() %>% 
  group_by(LocationName, Tax_update) %>% 
  summarise(SUM_bytax = sum(AVG_count)) %>% 
  data.frame
# unique(bac_wcuratedtax$Tax_update)
unique(bac_wcuratedtax_toplot$LocationName)
## [1] Candelabra Vent   Deep seawater     Mt Edwards Plume  Mt Edwards Vent  
## [5] Shallow seawater  SirVentsAlot Vent Venti Latte Vent 
## 10 Levels: Blue Ciliate Candelabra Vent Deep seawater ... Venti Latte Vent
bac_wcuratedtax_toplot$LOCATION <- factor(bac_wcuratedtax_toplot$LocationName, levels = c("Shallow seawater", "Deep seawater", "Mt Edwards Plume", "Mt Edwards Vent", "Venti Latte Vent", "SirVentsAlot Vent", "Candelabra Vent"))
#
# bac_wtax <- bac_df_plot %>% 
#   left_join(asv_curated_updated) %>% 
#   filter(!(Genus == "Ralstonia")) %>% 
#   # Average ASV seq count across replicates
#   group_by(Feature.ID, LocationName, Tax_update) %>% 
#   summarise(AVG_count = mean(value)) %>% 
#   data.frame
# write_delim(bac_wtax, path = "bac-wtaxassigned-20-08-2020.txt", delim = "\t")

4.1.4 Make bar plots

tax_color<-c("#a50026","#d73027","#f46d43","#fdae61","#fee090","#ffffbf","#40004b","#762a83","#9970ab","#c2a5cf","#e7d4e8","#d9f0d3","#a6dba0","#5aae61","#1b7837","#00441b","#e0f3f8","#abd9e9","#74add1","#4575b4","#313695","#8e0152","#c51b7d","#de77ae","#f1b6da","#fde0ef","#e6f5d0","#b8e186","#7fbc41","#4d9221","#276419","#bababa","#878787","#4d4d4d","#1a1a1a")
tax_order <- c("Epsilonbacteraeota-Arcobacter","Epsilonbacteraeota-Caminibacter","Epsilonbacteraeota-Campylobacter","Epsilonbacteraeota-Hydrogenimonas","Epsilonbacteraeota-Nitratifractor","Epsilonbacteraeota-Nitratiruptor","Epsilonbacteraeota-Sulfurimonas","Epsilonbacteraeota-Sulfurovum","Proteobacteria-Alphaproteobacteria","Proteobacteria-Deltaproteobacteria","Proteobacteria-Gammaproteobacteria","Proteobacteria-Methylococcales","Proteobacteria-Oceanospirillales","Proteobacteria-SUP05 cluster","Acidobacteria","Actinobacteria","Aquificae","Bacteroidetes","Chloroflexi","Thaumarchaeota-Nitrososphaeria","Euryarchaeota-Thermoplasmata","Fusobacteria-Psychrilyobacter","Marinimicrobia (SAR406 clade)","Nitrospinae-Nitrospinaceae","Other")

bac_wcuratedtax_toplot$TAX_ORDER <- factor(bac_wcuratedtax_toplot$Tax_update, levels = tax_order)

barplot_16s <- function(df){
  ggplot(df, aes(x = LOCATION, y = SUM_bytax, fill = TAX_ORDER)) +
    geom_bar(stat = "identity", position = "fill", color = "black") +
    scale_fill_manual(values = tax_color) +
    scale_y_continuous(expand = c(0,0)) +
    theme(legend.position = "right",
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank(),
          panel.background = element_blank(),
          axis.text.x = element_text(angle = 90, hjust=1, vjust=0.5, color = "black", face="bold", size = 12),
          axis.text.y = element_text(color = "black", face="bold", size = 12),
          strip.text = element_blank(),
          legend.title = element_blank()) +
    labs(x="", y="Relative abundance") +
    facet_grid(.~LOCATION, space = "free", scales = "free") +
    guides(fill=guide_legend(ncol=1))
}
# svg("16s-curated-static.svg", h=8, w=8)
barplot_16s(bac_wcuratedtax_toplot)

# dev.off()
euk_key <- get_legend(barplot_lev2(filter(gr_counts_avg_TAXA, (SAMPLEID %in% insitu & !(Taxa %in% rm)))))
prok_key <- get_legend(barplot_16s(bac_wcuratedtax_toplot))
# svg("panel-18s-16s-barplot.svg", w=20, h = 10)
plot_grid(
  barplot_lev2(filter(gr_counts_avg_TAXA, (SAMPLEID %in% insitu & !(Taxa %in% rm)))) + theme(legend.position = "none"),
  euk_key, 
  barplot_16s(bac_wcuratedtax_toplot) + theme(legend.position = "none"),
  prok_key,
  rel_widths = c(1,0.5,1,0.5),
  align = ("v"),
  labels = c("a.", "", "b.", ""),
  ncol = 4)
## Warning: Graphs cannot be vertically aligned unless the axis parameter is set.
## Placing graphs unaligned.

# dev.off()
library(plotly)
head(bac_wcuratedtax)
##                         Feature.ID  Domain         Phylum           Class
## 1 139ad7417ef0dfb630699faea2eb5e06 Archaea Thaumarchaeota Nitrososphaeria
## 2 139ad7417ef0dfb630699faea2eb5e06 Archaea Thaumarchaeota Nitrososphaeria
## 3 139ad7417ef0dfb630699faea2eb5e06 Archaea Thaumarchaeota Nitrososphaeria
## 4 139ad7417ef0dfb630699faea2eb5e06 Archaea Thaumarchaeota Nitrososphaeria
## 5 139ad7417ef0dfb630699faea2eb5e06 Archaea Thaumarchaeota Nitrososphaeria
## 6 139ad7417ef0dfb630699faea2eb5e06 Archaea Thaumarchaeota Nitrososphaeria
##              Order            Family                      Genus
## 1 Nitrosopumilales Nitrosopumilaceae uncultured marine archaeon
## 2 Nitrosopumilales Nitrosopumilaceae uncultured marine archaeon
## 3 Nitrosopumilales Nitrosopumilaceae uncultured marine archaeon
## 4 Nitrosopumilales Nitrosopumilaceae uncultured marine archaeon
## 5 Nitrosopumilales Nitrosopumilaceae uncultured marine archaeon
## 6 Nitrosopumilales Nitrosopumilaceae uncultured marine archaeon
##                      Species Confidence         SAMPLEID_16S value
## 1 uncultured marine archaeon    0.86145        NA108001aSTEP  1179
## 2 uncultured marine archaeon    0.86145        NA108001bSTEP  2971
## 3 uncultured marine archaeon    0.86145 NA108003STEP20200226  6931
## 4 uncultured marine archaeon    0.86145         NA108009STEP     0
## 5 uncultured marine archaeon    0.86145         NA108010STEP   155
## 6 uncultured marine archaeon    0.86145         NA108012STEP   677
##                 SAMPLE_AMY Sampletype SAMPLEID     LocationName STATUS
## 1 Niskin Plume Mt. Edwards    in situ sterivex Mt Edwards Plume   keep
## 2 Niskin Plume Mt. Edwards    in situ sterivex Mt Edwards Plume   keep
## 3               Niskin BSW    in situ sterivex    Deep seawater   keep
## 4  SUPR Filter Mt. Edwards    in situ     SUPR  Mt Edwards Vent   keep
## 5  SUPR Filter Mt. Edwards    in situ     SUPR  Mt Edwards Vent   keep
## 6     SUPR BAG Mt. Edwards    in situ     SUPR  Mt Edwards Vent   keep
##   FLUIDORIGIN NEW_SAMPLEID                     Tax_update
## 1     Niskin7     Plume001 Thaumarchaeota-Nitrososphaeria
## 2     Niskin7     Plume001 Thaumarchaeota-Nitrososphaeria
## 3    Niskin10       BSW003 Thaumarchaeota-Nitrososphaeria
## 4          S1      Vent009 Thaumarchaeota-Nitrososphaeria
## 5          S2      Vent010 Thaumarchaeota-Nitrososphaeria
## 6          S7      Vent012 Thaumarchaeota-Nitrososphaeria
bac_wcuratedtax$LOCATION <- factor(bac_wcuratedtax$LocationName, levels = c("Shallow seawater", "Deep seawater", "Mt Edwards Plume", "Mt Edwards Vent", "Venti Latte Vent", "SirVentsAlot Vent", "Candelabra Vent"))
relabun_16s <- bac_wcuratedtax %>%
  #Sum by sample
  group_by(LOCATION) %>%
  mutate(total = sum(value)) %>%
  mutate(RelAbun = 100*(value/total)) %>%
  data.frame
# # head(relabun_16s)
# Interactive plot
relabun_16s %>% plot_ly(x = ~RelAbun, 
                            y = ~LOCATION, 
                            color = ~Tax_update,
                            name = ~Tax_update,
                            type = 'bar', 
                            orientation = 'h',
                            colors = tax_color,
                            text = ~Tax_update) %>% 
  layout(barmode = "stack")
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

5 Ordination analyses

5.1 18S-derived data

load("data-input/GordaRidge-ASVtable-avg-19-08-2020.RData")
library(reshape2); 
library(vegan); 
library(dplyr)
library(ade4); 
library(compositions); 
library(tidyverse)
library(purrr)
library(cluster)
library(RColorBrewer)
library(ape)
# Remove controls
gr_counts_allsamples <- subset(counts_decont, !(SAMPLEID == "CTRL"))
gr_nums <- gr_counts_allsamples[, c("Feature.ID", "SAMPLE", "COUNT")]
tax <- gr_counts_allsamples[, c("Feature.ID", "Taxon_updated")]
# unique(gr_nums$SAMPLE)
gr_nums_filter <- subset(gr_nums, !(SAMPLE == "GordaRidge_BSW020_sterivex_2019_REPa"))
gr_nums_wide <- dcast(gr_nums_filter, Feature.ID~SAMPLE, fill=0)
## Using COUNT as value column: use value.var to override.
row.names(gr_nums_wide)<-gr_nums_wide$Feature.ID; gr_nums_wide$Feature.ID<-NULL

5.1.1 Data transformation-Jaccard

# Jaccard distance
jac_gr_mat <- vegdist(t(gr_nums_wide), method = "jaccard")

# PCoA
pcoa_jac_gr <- pcoa(jac_gr_mat)

Jaccard distance transformation was done on the raw sequences reads. Below is a diagnostic scree plot to determine the appropriate number of axes.

# Extract variances from pcoa, from jaccard calculated dist. metric
jac_var_gr <- data.frame(pcoa_jac_gr$values$Relative_eig) %>% 
  select(PercVar = 'pcoa_jac_gr.values.Relative_eig') %>% 
  rownames_to_column(var = "PCaxis") %>% 
  data.frame
head(jac_var_gr[1:4,]) #Look at first 4 axes
##   PCaxis    PercVar
## 1      1 0.11270615
## 2      2 0.08524318
## 3      3 0.07306333
## 4      4 0.06505124
# Screeplot check
ggplot(jac_var_gr, aes(x = as.numeric(PCaxis), y = PercVar)) + 
  geom_bar(stat = "identity", fill = "grey", color = "black") +
  theme_minimal() +
  theme(axis.title = element_text(color = "black", face = "bold", size = 10),
        axis.text.y = element_text(color = "black", face = "bold"),
        axis.text.x = element_blank()) +
  labs(x = "PC axis", y = "% Variance", title = "Log-Ratio PCoA Screeplot")

5.1.2 MDS and extract data for plot

# We want to show data in 2 axes - 
jac_nmds_gr<-metaMDS(jac_gr_mat,k=2,autotransform=FALSE)
## Run 0 stress 0.1774623 
## Run 1 stress 0.2193465 
## Run 2 stress 0.1755825 
## ... New best solution
## ... Procrustes: rmse 0.124748  max resid 0.4205996 
## Run 3 stress 0.1775817 
## Run 4 stress 0.2148195 
## Run 5 stress 0.1774623 
## Run 6 stress 0.1775817 
## Run 7 stress 0.1775817 
## Run 8 stress 0.207624 
## Run 9 stress 0.1775817 
## Run 10 stress 0.1774623 
## Run 11 stress 0.2138939 
## Run 12 stress 0.1755826 
## ... Procrustes: rmse 8.847588e-05  max resid 0.0003193724 
## ... Similar to previous best
## Run 13 stress 0.1755825 
## ... Procrustes: rmse 3.223444e-05  max resid 9.831384e-05 
## ... Similar to previous best
## Run 14 stress 0.2482208 
## Run 15 stress 0.1667834 
## ... New best solution
## ... Procrustes: rmse 0.1387968  max resid 0.405167 
## Run 16 stress 0.1667834 
## ... New best solution
## ... Procrustes: rmse 5.706342e-05  max resid 0.0001853986 
## ... Similar to previous best
## Run 17 stress 0.1636427 
## ... New best solution
## ... Procrustes: rmse 0.1173622  max resid 0.4152786 
## Run 18 stress 0.1878499 
## Run 19 stress 0.1878508 
## Run 20 stress 0.2172173 
## *** No convergence -- monoMDS stopping criteria:
##     20: stress ratio > sratmax
jac_nmds_gr$stress
## [1] 0.1636427
ventnames <- read.delim("data-input/ventnames-gordaridge.txt")
# names(ventnames);head(ventnames)
colnames(ventnames)[1]<-"SAMPLE"

jac_nmds_df <- data.frame(jac_nmds_gr$points) %>% 
    rownames_to_column(var = "SAMPLE") %>% 
    separate(SAMPLE, into = c("DATASET", "LOCATION_SPECIFIC", "SAMPLEID", "YEAR", "REP"), sep = "_", remove = FALSE) %>% 
    left_join(ventnames) %>% 
    data.frame
## Warning: Expected 5 pieces. Missing pieces filled with `NA` in 16 rows [2, 10,
## 11, 12, 13, 17, 18, 19, 22, 23, 24, 25, 26, 27, 28, 29].
## Joining, by = c("SAMPLE", "LOCATION_SPECIFIC", "SAMPLEID")
# Factoring
sample_order <- c('Mt Edwards Plume','Mt Edwards Vent','Candelabra Vent','SirVentsAlot Vent','Venti Latte Vent','Deep seawater', 'Shallow seawater')
sample_color <-c("lightblue","#377eb8", "#e41a1c", "#4daf4a", "#984ea3", "orange", "yellow")
jac_nmds_df$SAMPLE_ORDER <- factor(jac_nmds_df$LocationName, levels = rev(sample_order))
names(sample_color)<-sample_order
# head(jac_nmds_df)

5.1.3 18S MDS plot

library(ggrepel)
# options(repr.plot.width = 8, repr.plot.height = 6)

# svg("PCA-wlabels-gr-22-07-2020.svg", w=8, h=8)
nmds_jac_18s <- ggplot(jac_nmds_df, aes(x = MDS1, y = MDS2, 
                                   fill = LocationName,
                                   shape = Sampletype, 
                                   label = LOCATION_SPECIFIC)) + #Replace label=SAMPLEID.y
  geom_hline(yintercept = 0) + geom_vline(xintercept = 0) +
  geom_point(aes(x = MDS1, y = MDS2, fill = LocationName), size = 5) +
  scale_fill_manual(values = sample_color) +
#   labs(x = "NMDS 1", y = "NMDS 2", title = "Jaccard Distance NMDS") +
  labs(x = "NMDS 1", y = "NMDS 2", title = "") +
  scale_shape_manual(values = c(23,21))+
  theme_bw()+
  geom_text_repel()+
  theme(axis.text = element_text(color="black", size=12),
        legend.title = element_blank())+
  guides(fill = guide_legend(override.aes=list(shape=21)))
nmds_jac_18s # dev.off()

5.1.4 Data transformation-CLR

# # Transform data - with CLR
# # Log-ratio
log_rats<-data.frame(compositions::clr(t(gr_nums_wide)))

# # look at eigenvalues
pca_lr <- prcomp(log_rats)
variance_lr <- (pca_lr$sdev^2)/sum(pca_lr$sdev^2)
# head(variance_lr)
barplot(variance_lr,
        main='Log-Ratio PCA Screeplot',
        xlab='PC Axis',
        ylab='% Variance',
        cex.names=1.5,cex.axis=1.5,cex.lab=1.5,cex.main=1.5)

> Based on this screeplot - 2 axis are OK, as they show 0.079 and 0.077, respectively, of the variance.

5.1.5 Plot PCA

# # Extract PCA and plot
pca_lr_frame<-data.frame(pca_lr$x,SAMPLE=rownames(pca_lr$x))
# # head(pca_lr_frame)
x <- colsplit(pca_lr_frame$SAMPLE, "_", c("DATASET", "LOCATION", "SAMPLEID", "YEAR", "REP"));
pca_lr_frame <- data.frame(x, pca_lr_frame)
# head(pca_lr_frame)
ventnames <- read.delim("data-input/ventnames-gordaridge.txt")
# names(ventnames);head(ventnames)
# # View(ventnames)
colnames(ventnames)[1]<-"SAMPLE"
# #
pca_lr_frame_wNames <- left_join(pca_lr_frame, ventnames, by="SAMPLE")
# head(pca_lr_frame_wNames)
# #
# unique(pca_lr_frame_wNames$LocationName)
sample_order <- c('Mt Edwards Plume','Mt Edwards Vent','Candelabra Vent','SirVentsAlot Vent','Venti Latte Vent','Deep seawater', 'Shallow seawater')
sample_color <-c("lightblue","#377eb8", "#e41a1c", "#4daf4a", "#984ea3", "orange", "yellow")
pca_lr_frame_wNames$SAMPLE_ORDER <- factor(pca_lr_frame_wNames$LocationName, levels = rev(sample_order))
names(sample_color)<-sample_order
# head(pca_lr_frame_wNames)
library(ggrepel)
# options(repr.plot.width = 8, repr.plot.height = 8)

# svg("PCA-wlabels-gr-22-07-2020.svg", w=8, h=8)
pca_18s <- ggplot(pca_lr_frame_wNames, aes(x=PC1,y=PC2,fill=LocationName,shape=Sampletype, label=LOCATION_SPECIFIC))+ #Replace label=SAMPLEID.y
  geom_point(aes(x=PC1,y=PC2,fill=LocationName), size=5)+
  scale_fill_manual(values = sample_color)+
  ylab(paste0('PC2 ',round(variance_lr[2]*100,2),'%'))+
  xlab(paste0('PC1 ',round(variance_lr[1]*100,2),'%'))+
  scale_shape_manual(values = c(23,21))+
  ggtitle('CLR PCA Ordination')+
#   coord_fixed(ratio=variance_lr[2]/variance_lr[1])+
  theme_bw()+
  geom_text_repel()+
  theme(axis.text = element_text(color="black", size=12))+
  geom_hline(yintercept = 0) + geom_vline(xintercept = 0) # dev.off()

5.2 16S-derived data

5.2.1 Data transformation

# Import and process 16S ASV table
ventnames_16 <- read.delim("data-input/ventnames-gordaridge-16S.txt")
countbac <- read.delim("data-input/CountTable-wtax-16s-plus3-2020-06-23.txt")
countbac_df <- countbac %>%
  separate(Taxon, sep = ";D_[[:digit:]]__", into = c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"), remove = TRUE, extra = "merge") %>% # Warnings are OK with NAs
  mutate_if(is.character, str_replace_all, pattern = "D_0__", replacement = "") %>%
  column_to_rownames(var = "Feature.ID") %>%
  data.frame
# Warnings are OK because not all levels fill out taxa spaces
# head(countbac_df[1:3,])
# Total ASVs and sequences:
count_summary <- countbac_df %>%
  select(starts_with("NA")) %>% 
  data.frame

## Create new column in ASV table that reports if an ASV is present in any of the background seawater samples
# head(countbac_df[1:2,]); names(countbac_df)
### Select BSW only samples and make character list:
bsw_samples <- ventnames_16 %>% 
  filter(LocationName == "Deep seawater" | LocationName == "Shallow seawater") %>% 
  select(SAMPLEID_16S) %>%
  data.frame
bsw_samples <- as.character(unique(bsw_samples$SAMPLEID_16S))
# bsw_samples
# Create new df with bsw ASVs labeled
sub_bsw_df <- countbac_df %>%
  rownames_to_column(var = "Feature.ID") %>% 
  select(Feature.ID, all_of(bsw_samples)) %>%
  mutate(sum = rowSums(select_if(., is.numeric))) %>% 
  filter(sum > 0) %>% 
  add_column(bsw_presence = "present in bsw") %>% 
  select(Feature.ID, bsw_presence) %>% 
  data.frame
# head(sub_bsw_df)
# unique(sub_bsw_df$bsw_presence)

# Add to original ASV table
countbac_df_cat <- countbac_df %>% 
  rownames_to_column(var = "Feature.ID") %>% 
  left_join(sub_bsw_df) %>%
  mutate(bsw_presence = replace_na(bsw_presence, "vent only")) %>% 
  select(-Confidence) %>% 
  mutate(ASV_SUM_ALL = rowSums(select_if(., is.numeric))) %>% 
  mutate(ASV_REL_ABUN = 100*(ASV_SUM_ALL/sum(ASV_SUM_ALL))) %>%
  # filter(ASV_REL_ABUN > ##) %>%  #Filter by rel abun
  data.frame
# head(countbac_df_cat)
# Remove samples that were repeated
rm <- c("NA108003STEP", "NA108039STEP", "NA108087STEP")

# Filtering count table data:
df_start <- countbac_df_cat %>% 
  filter(Domain %in% "Archaea" | Domain %in% "Bacteria") %>% #Select only archaea and bacteria, removing unassigned
  select(-all_of(rm)) %>% # Remove samples we are replacing
  # Could include other filtering options if necessary
  select(Feature.ID, starts_with("NA108")) %>% 
  data.frame

# Prep data so it is ready for data transformations and more
df_format <- df_start %>% 
  column_to_rownames(var = "Feature.ID") %>% 
  as.matrix
# class(df_format)

ventnames_16_mod <- ventnames_16 %>% 
    mutate(location = case_when(
        grepl("Plume", SAMPLE_AMY) ~ "Plume",
        grepl("BSW", SAMPLE_AMY) ~ "BSW", 
        grepl("Vent", LocationName) ~ "Vent"),
          NA_NUM = SAMPLEID_16S) %>% 
    mutate(NA_NUM = str_replace(NA_NUM, "NA108", "")) %>%
    mutate(NA_NUM = str_replace(NA_NUM, "NA080", "")) %>% 
    mutate(NA_NUM = str_replace(NA_NUM, "aSTEP", "")) %>% 
    mutate(NA_NUM = str_replace(NA_NUM, "bSTEP", "")) %>% 
    mutate(NA_NUM = str_replace(NA_NUM, "STEP20200226", "")) %>% 
    mutate(NA_NUM = str_replace(NA_NUM, "STEP", "")) %>% 
    unite(NEW_SAMPLEID, location, NA_NUM, sep ="") %>% 
    data.frame
# ventnames_16_mod

5.2.2 Data transformation-Jaccard

# NMDS - Jaccard distance calculation
jac_gr_16s <- vegdist(t(df_format))

# Ordination - PCA
pcoa_jac_16s <- pcoa(jac_gr_16s)

# Extract variances from pcoa, from jaccard calculated dist. metric
jac_var_16s <- data.frame(pcoa_jac_16s$values$Relative_eig) %>% 
  select(PercVar = 'pcoa_jac_16s.values.Relative_eig') %>% 
  rownames_to_column(var = "PCaxis") %>% 
  data.frame
# head(jac_var_16s[1:4,]) #Look at first 4 axes
# Screeplot check
ggplot(jac_var_16s, aes(x = as.numeric(PCaxis), y = PercVar)) + 
  geom_bar(stat = "identity", fill = "grey", color = "black") +
  theme_minimal() +
  theme(axis.title = element_text(color = "black", face = "bold", size = 10),
        axis.text.y = element_text(color = "black", face = "bold"),
        axis.text.x = element_blank()) +
  labs(x = "PC axis", y = "% Variance", title = "Log-Ratio PCoA Screeplot")

5.2.3 MDS

jac_nmds_16s <- metaMDS(jac_gr_16s,k=2,autotransform=FALSE)
## Run 0 stress 0.1330027 
## Run 1 stress 0.1325471 
## ... New best solution
## ... Procrustes: rmse 0.06952486  max resid 0.2841175 
## Run 2 stress 0.1325477 
## ... Procrustes: rmse 0.0002315351  max resid 0.0006870596 
## ... Similar to previous best
## Run 3 stress 0.1221584 
## ... New best solution
## ... Procrustes: rmse 0.1042181  max resid 0.4838947 
## Run 4 stress 0.1220955 
## ... New best solution
## ... Procrustes: rmse 0.005784477  max resid 0.0202706 
## Run 5 stress 0.1135523 
## ... New best solution
## ... Procrustes: rmse 0.07859589  max resid 0.3404599 
## Run 6 stress 0.1632819 
## Run 7 stress 0.133565 
## Run 8 stress 0.1219904 
## Run 9 stress 0.1135523 
## ... New best solution
## ... Procrustes: rmse 1.320668e-05  max resid 3.740674e-05 
## ... Similar to previous best
## Run 10 stress 0.1135975 
## ... Procrustes: rmse 0.009193881  max resid 0.03432903 
## Run 11 stress 0.145132 
## Run 12 stress 0.1392012 
## Run 13 stress 0.1335651 
## Run 14 stress 0.1518077 
## Run 15 stress 0.1392441 
## Run 16 stress 0.1248836 
## Run 17 stress 0.1300336 
## Run 18 stress 0.1301932 
## Run 19 stress 0.1454087 
## Run 20 stress 0.1135975 
## ... Procrustes: rmse 0.009204767  max resid 0.0344076 
## *** Solution reached
jac_nmds_16s$stress
## [1] 0.1135523
# Make dataframe of PCoA variables and import vent names
sample_order <- c('Mt Edwards Plume','Mt Edwards Vent','Candelabra Vent','SirVentsAlot Vent','Venti Latte Vent','Deep seawater', 'Shallow seawater')

jac_nmds_df_16s <- data.frame(jac_nmds_16s$points) %>% 
    rownames_to_column(var = "SAMPLEID_16S") %>% 
    left_join(ventnames_16_mod) %>% 
    filter(LocationName %in% sample_order) %>% 
  data.frame
## Joining, by = "SAMPLEID_16S"

5.2.4 Plot MDS

sample_order <- c('Mt Edwards Plume','Mt Edwards Vent','Candelabra Vent','SirVentsAlot Vent','Venti Latte Vent','Deep seawater', 'Shallow seawater')
sample_color <-c("lightblue","#377eb8", "#e41a1c", "#4daf4a", "#984ea3", "orange", "yellow")
jac_nmds_df_16s$SAMPLE_ORDER <- factor(jac_nmds_df_16s$LocationName, levels = rev(sample_order))
names(sample_color)<-sample_order

library(ggrepel)
# head(jac_nmds_df_16s[1:3,])

options(repr.plot.width = 8, repr.plot.height = 6)

# svg("PCA-wlabels-gr-22-07-2020.svg", w=8, h=8)
nmds_jac_16s <- ggplot(jac_nmds_df_16s, aes(x = MDS1, y = MDS2, 
                                   fill = LocationName,
                                   label = NEW_SAMPLEID)) + #Replace label=SAMPLEID.y
  geom_hline(yintercept = 0) + geom_vline(xintercept = 0) +
  geom_point(aes(x = MDS1, y = MDS2, fill = LocationName), size = 5, shape = 21) +
  scale_fill_manual(values = sample_color) +
#   labs(x = "NMDS 1", y = "NMDS 2", title = "Jaccard Distance NMDS (16S)") +
  labs(x = "NMDS 1", y = "NMDS 2", title = "") +
  theme_bw()+
  geom_text_repel()+
  theme(axis.text = element_text(color="black", size=12),
       legend.title = element_blank()) +
  guides(fill = guide_legend(override.aes=list(shape=21)))
nmds_jac_16s # dev.off()

5.2.5 Data transformation-CLR

# # Log-Ratio
# library(compositions)
df_log_clr <- data.frame(clr(t(df_format)))

# # Ordination - PCA
# # ?prcomp()
pca_clr <- prcomp(df_log_clr)

# # Check variance
check_variance <- (pca_clr$sdev^2)/sum(pca_clr$sdev^2)
# head(check_variance)
# # Screeplot, how many axes are appropriate?
barplot(check_variance,
        main='Log-Ratio PCA Screeplot',
        xlab='PC Axis',
        ylab='% Variance',
        cex.names=1.5,cex.axis=1.5,cex.lab=1.5,cex.main=1.5)

sample_order <- c('Mt Edwards Plume','Mt Edwards Vent','Candelabra Vent','SirVentsAlot Vent','Venti Latte Vent','Deep seawater', 'Shallow seawater')
df_pca_clr <- data.frame(pca_clr$x, SAMPLEID_16S = rownames(pca_clr$x))

df_pca_clr_wnames <- df_pca_clr %>%
  left_join(ventnames_16_mod) %>%
  filter(LocationName %in% sample_order) %>%
  data.frame
## Joining, by = "SAMPLEID_16S"

5.2.6 Plot PCA

sample_order <- c('Mt Edwards Plume','Mt Edwards Vent','Candelabra Vent','SirVentsAlot Vent','Venti Latte Vent','Deep seawater', 'Shallow seawater')
sample_color <- c("lightblue","#377eb8", "#e41a1c", "#4daf4a", "#984ea3", "orange", "yellow")
df_pca_clr_wnames$SAMPLE_ORDER <- factor(df_pca_clr_wnames$LocationName, levels = rev(sample_order))
names(sample_color)<-sample_order

library(ggrepel)
# options(repr.plot.width = 8, repr.plot.height = 6)

# svg("PCoA-16S-wolabels.svg", h = 8, w = 8)
pca_16s <- ggplot(df_pca_clr_wnames, aes(x = PC1, y = PC2, fill = SAMPLE_ORDER, label = NEW_SAMPLEID)) +
  geom_point(aes(x = PC1,y = PC2, fill = LocationName), size = 4, shape = 21) +
  scale_fill_manual(values = sample_color) +
  ylab(paste0('PC2 ',round(check_variance[2]*100,2),'%')) +
  xlab(paste0('PC1 ',round(check_variance[1]*100,2),'%')) +
  ggtitle('16S - CLR PCA Ordination') +
  geom_text_repel() +
  theme_bw() +
  theme(axis.text = element_text(color = "black", size = 12)) +
  geom_hline(yintercept = 0) + geom_vline(xintercept = 0)

# pca_16s

5.3 Plot 18S & 16S MDS

# options(repr.plot.width = 15, repr.plot.height = 6)
nmdsleg <- get_legend(nmds_jac_18s)
# svg("panel-NMDS.svg", w = 15, h = 6)
plot_grid(nmds_jac_18s + theme(legend.position = "none"), 
          nmds_jac_16s + theme(legend.position = "none"), 
          nmdsleg, nrow = 1, rel_widths = c(1,1,0.5), 
          labels = c("a. 18S", "b. 16S", ""))

# dev.off()

5.4 Plot 18S & 16S PCA

# options(repr.plot.width = 15, repr.plot.height = 6)

# svg("panel-ordinations.svg", w = 15, h = 6)
plot_grid(pca_18s, pca_16s, nrow = 1, labels = c("a", "b")) # dev.off()

6 Classify 18S ASVs by distribution

load("data-input/GordaRidge-ASVtable-avg-19-08-2020.RData",verbose = T)
## Loading objects:
##   gr_counts_filter
##   gr_counts_wtax
##   gr_counts_avg_wtax
unique(gr_counts_avg_wtax[, c("Sampletype", "LocationName")]) #categories to consider
##     Sampletype      LocationName
## 1      in situ  Shallow seawater
## 3      in situ  Venti Latte Vent
## 4      in situ   Candelabra Vent
## 5      Grazing SirVentsAlot Vent
## 6      in situ     Deep seawater
## 8      in situ  Mt Edwards Plume
## 12     in situ SirVentsAlot Vent
## 13     Grazing  Mt Edwards Plume
## 17     in situ   Mt Edwards Vent
## 36     Grazing   Mt Edwards Vent
## 40     Grazing   Candelabra Vent
## 49     Grazing  Venti Latte Vent
## 135    Control         Lab blank
bin_category <- gr_counts_avg_wtax %>% 
  filter(!(Sampletype == "Control")) %>% 
  select(Feature.ID, Sampletype, LocationName, COUNT_AVG) %>% 
  data.frame

bin_category$sample_tmp <- paste(bin_category$Sampletype, bin_category$LocationName, sep = "_")
bin_category$sample_tmp <- gsub(" ","_", bin_category$sample_tmp)
# head(bin_category)
bin_category$Sampletype<-NULL; bin_category$LocationName <- NULL
# head(bin_category)
#
library(reshape2)
bin_wide <- dcast(bin_category, Feature.ID ~ sample_tmp) #aggregate to length is OK - counts occurences

6.1 ID ASVs based on conditions

shared_all <- bin_wide %>% 
  filter((in_situ_Shallow_seawater > 0 & 
            in_situ_Deep_seawater > 0 &
            in_situ_Mt_Edwards_Plume > 0 &
            Grazing_Mt_Edwards_Plume > 0 &
            in_situ_Mt_Edwards_Vent > 0 &
            Grazing_Mt_Edwards_Vent > 0 &
            in_situ_Venti_Latte_Vent > 0 &
            Grazing_Venti_Latte_Vent > 0 &
            in_situ_Candelabra_Vent > 0 &
            Grazing_Candelabra_Vent > 0 &
            in_situ_SirVentsAlot_Vent > 0 &
            Grazing_SirVentsAlot_Vent > 0)) %>%
  select(Feature.ID) %>%
  distinct() %>%
  add_column(CATEGORY = "ASV present in all samples") %>%
  data.frame
# dim(shared_all)
asv_key <- shared_all
#
bsw_only <- bin_wide %>% 
  filter((in_situ_Shallow_seawater > 0 & 
            in_situ_Deep_seawater > 0 &
            in_situ_Mt_Edwards_Plume == 0 &
            Grazing_Mt_Edwards_Plume == 0 &
            in_situ_Mt_Edwards_Vent == 0 &
            Grazing_Mt_Edwards_Vent == 0 &
            in_situ_Venti_Latte_Vent == 0 &
            Grazing_Venti_Latte_Vent == 0 &
            in_situ_Candelabra_Vent == 0 &
            Grazing_Candelabra_Vent == 0 &
            in_situ_SirVentsAlot_Vent == 0 &
            Grazing_SirVentsAlot_Vent == 0)) %>%
  select(Feature.ID) %>%
  distinct() %>%
  add_column(CATEGORY = "BSW only") %>%
  data.frame
# head(bsw_only); dim(bsw_only)
#
asv_key <- rbind(asv_key, bsw_only)
# dim(asv_key)
#
shared_edwards <- bin_wide %>% 
  filter((in_situ_Shallow_seawater == 0 & 
            in_situ_Deep_seawater == 0 &
            in_situ_Mt_Edwards_Plume == 0 &
            Grazing_Mt_Edwards_Plume == 0 &
            in_situ_Mt_Edwards_Vent > 0 &
            Grazing_Mt_Edwards_Vent > 0 &
            in_situ_Venti_Latte_Vent == 0 &
            Grazing_Venti_Latte_Vent == 0 &
            in_situ_Candelabra_Vent == 0 &
            Grazing_Candelabra_Vent == 0 &
            in_situ_SirVentsAlot_Vent == 0 &
            Grazing_SirVentsAlot_Vent == 0)) %>%
  select(Feature.ID) %>%
  distinct() %>%
  add_column(CATEGORY = "Shared between in situ vent and grazing exp") %>%
  data.frame
# head(shared_edwards); dim(shared_edwards)
#
shared_edwards_wplume <- bin_wide %>% 
  filter((in_situ_Shallow_seawater == 0 & 
            in_situ_Deep_seawater == 0 &
            in_situ_Mt_Edwards_Plume > 0 &
            Grazing_Mt_Edwards_Plume > 0 &
            in_situ_Mt_Edwards_Vent == 0 &
            Grazing_Mt_Edwards_Vent == 0 &
            in_situ_Venti_Latte_Vent == 0 &
            Grazing_Venti_Latte_Vent == 0 &
            in_situ_Candelabra_Vent == 0 &
            Grazing_Candelabra_Vent == 0 &
            in_situ_SirVentsAlot_Vent == 0 &
            Grazing_SirVentsAlot_Vent == 0)) %>%
  select(Feature.ID) %>%
  distinct() %>%
  add_column(CATEGORY = "Shared between in situ vent and grazing exp") %>%
  data.frame
# head(shared_edwards_wplume); dim(shared_edwards_wplume)
#
shared_ventilatte <- bin_wide %>% 
  filter((in_situ_Shallow_seawater == 0 & 
            in_situ_Deep_seawater == 0 &
            in_situ_Mt_Edwards_Plume == 0 &
            Grazing_Mt_Edwards_Plume == 0 &
            in_situ_Mt_Edwards_Vent == 0 &
            Grazing_Mt_Edwards_Vent == 0 &
            in_situ_Venti_Latte_Vent > 0 &
            Grazing_Venti_Latte_Vent > 0 &
            in_situ_Candelabra_Vent == 0 &
            Grazing_Candelabra_Vent == 0 &
            in_situ_SirVentsAlot_Vent == 0 &
            Grazing_SirVentsAlot_Vent == 0)) %>%
  select(Feature.ID) %>%
  distinct() %>%
  add_column(CATEGORY = "Shared between in situ vent and grazing exp") %>%
  data.frame
# dim(shared_ventilatte)
#
shared_candelabra <- bin_wide %>% 
  filter((in_situ_Shallow_seawater == 0 & 
            in_situ_Deep_seawater == 0 &
            in_situ_Mt_Edwards_Plume == 0 &
            Grazing_Mt_Edwards_Plume == 0 &
            in_situ_Mt_Edwards_Vent == 0 &
            Grazing_Mt_Edwards_Vent == 0 &
            in_situ_Venti_Latte_Vent == 0 &
            Grazing_Venti_Latte_Vent == 0 &
            in_situ_Candelabra_Vent > 0 &
            Grazing_Candelabra_Vent > 0 &
            in_situ_SirVentsAlot_Vent == 0 &
            Grazing_SirVentsAlot_Vent == 0)) %>%
  select(Feature.ID) %>%
  distinct() %>%
  add_column(CATEGORY = "Shared between in situ vent and grazing exp") %>%
  data.frame
# dim(shared_candelabra)
#
shared_Sirventsalot <- bin_wide %>% 
  filter((in_situ_Shallow_seawater == 0 & 
            in_situ_Deep_seawater == 0 &
            in_situ_Mt_Edwards_Plume == 0 &
            Grazing_Mt_Edwards_Plume == 0 &
            in_situ_Mt_Edwards_Vent == 0 &
            Grazing_Mt_Edwards_Vent == 0 &
            in_situ_Venti_Latte_Vent == 0 &
            Grazing_Venti_Latte_Vent == 0 &
            in_situ_Candelabra_Vent == 0 &
            Grazing_Candelabra_Vent == 0 &
            in_situ_SirVentsAlot_Vent > 0 &
            Grazing_SirVentsAlot_Vent > 0)) %>%
  select(Feature.ID) %>%
  distinct() %>%
  add_column(CATEGORY = "Shared between in situ vent and grazing exp") %>%
  data.frame
tmp_bin <- bin_wide; tmp_bin$SUM <- rowSums(tmp_bin[, 2:13])
tmp_bin2 <- subset(tmp_bin, SUM == 1)
tmp_bin2$CATEGORY <- "Unique to sample"
# head(tmp_bin2)
binned_unique <- tmp_bin2[, c("Feature.ID", "CATEGORY")]
length(unique(binned_unique$Feature.ID)); dim(binned_unique)
## [1] 6807
## [1] 6807    2
#
asv_key_2 <- rbind(asv_key, shared_edwards, shared_edwards_wplume, shared_ventilatte, shared_candelabra, shared_Sirventsalot, binned_unique)
dim(asv_key_2); length(unique(asv_key_2$Feature.ID))
## [1] 7004    2
## [1] 7004
appears_vent_plume <- bin_wide %>% 
  filter((in_situ_Shallow_seawater == 0 & 
            in_situ_Deep_seawater == 0)) %>%
  select(Feature.ID) %>%
  distinct() %>%
  add_column(CATEGORY_appear = "Appears in situ vent or plume (not BSW)") %>%
  data.frame
dim(appears_vent_plume)
## [1] 7360    2
#
appears_vent <- bin_wide %>% 
  filter((in_situ_Shallow_seawater == 0 & 
            in_situ_Deep_seawater == 0 &
            in_situ_Mt_Edwards_Plume == 0 &
            Grazing_Mt_Edwards_Plume == 0)) %>%
  select(Feature.ID) %>%
  distinct() %>%
  add_column(CATEGORY_appear = "Appears in situ vent (not BSW or plume)") %>%
  data.frame
dim(appears_vent)
## [1] 4832    2
#
appears_deep <- bin_wide %>% 
  filter(((in_situ_Shallow_seawater == 0 & 
             in_situ_Deep_seawater > 0) &
            in_situ_Mt_Edwards_Plume > 0 |
            # Grazing_Mt_Edwards_Plume == 0 &
            in_situ_Mt_Edwards_Vent > 0 |
            # Grazing_Mt_Edwards_Vent == 0 &
            in_situ_Venti_Latte_Vent > 0 |
            # Grazing_Venti_Latte_Vent == 0 &
            in_situ_Candelabra_Vent > 0 |
            # Grazing_Candelabra_Vent == 0 &
            in_situ_SirVentsAlot_Vent > 0 #&
          # Grazing_SirVentsAlot_Vent == 0
  )) %>%
  select(Feature.ID) %>%
  distinct() %>%
  add_column(CATEGORY_appear = "Appears in deep BSW and among vent/plume sites") %>%
  data.frame
dim(appears_deep)
## [1] 4969    2
# Regardless of grazing experiment presence:
# Add in this order, will overwrite the previous entry.
dim(appears_deep) # ASV appears BSW and vent and plume
## [1] 4969    2
dim(appears_vent_plume) # no BSW, among plume and vent
## [1] 7360    2
dim(appears_vent) # no BSW or plume, among vent
## [1] 4832    2
#
dim(asv_key_2); unique(asv_key_2$CATEGORY)
## [1] 7004    2
## [1] "ASV present in all samples"                 
## [2] "BSW only"                                   
## [3] "Shared between in situ vent and grazing exp"
## [4] "Unique to sample"
#
head(bin_wide)
##                         Feature.ID Grazing_Candelabra_Vent
## 1 0009645516609bda2246e1955ff9ec1d                       0
## 2 0030ad8ce44f257c42daf3673bf92197                       0
## 3 0038478be7fb4f097ce93a5e9341af2a                       0
## 4 003b5938e31a8c1b1809e0358da894e0                       0
## 5 003ff3e98dff52952a7036585a32c2f2                       0
## 6 004ebe8047915b78deefc412bef467b7                       0
##   Grazing_Mt_Edwards_Plume Grazing_Mt_Edwards_Vent Grazing_SirVentsAlot_Vent
## 1                        0                       0                         0
## 2                        0                       0                         1
## 3                        3                       0                         0
## 4                        0                       0                         0
## 5                        1                       0                         0
## 6                        0                       0                         0
##   Grazing_Venti_Latte_Vent in_situ_Candelabra_Vent in_situ_Deep_seawater
## 1                        0                       0                     0
## 2                        0                       1                     0
## 3                        0                       0                     1
## 4                        0                       0                     0
## 5                        0                       0                     0
## 6                        0                       1                     0
##   in_situ_Mt_Edwards_Plume in_situ_Mt_Edwards_Vent in_situ_Shallow_seawater
## 1                        0                       0                        1
## 2                        0                       0                        1
## 3                        3                       0                        1
## 4                        0                       0                        1
## 5                        0                       1                        0
## 6                        0                       0                        0
##   in_situ_SirVentsAlot_Vent in_situ_Venti_Latte_Vent
## 1                         0                        0
## 2                         0                        1
## 3                         1                        1
## 4                         0                        0
## 5                         0                        0
## 6                         0                        0
length(unique(bin_wide$Feature.ID)); dim(bin_wide)
## [1] 9028
## [1] 9028   13
#
# New data frame
asv_key_all <- data.frame(bin_wide[,c("Feature.ID")])
colnames(asv_key_all)[1] <- "Feature.ID"
head(asv_key_all)
##                         Feature.ID
## 1 0009645516609bda2246e1955ff9ec1d
## 2 0030ad8ce44f257c42daf3673bf92197
## 3 0038478be7fb4f097ce93a5e9341af2a
## 4 003b5938e31a8c1b1809e0358da894e0
## 5 003ff3e98dff52952a7036585a32c2f2
## 6 004ebe8047915b78deefc412bef467b7
deep <- as.character(unique(appears_deep$Feature.ID))
asv_key_all$SORTED[asv_key_all$Feature.ID %in% deep] = "ASV appears in BSW and throughout vent/plume"
vent_plume <- as.character(unique(appears_vent_plume$Feature.ID))
asv_key_all$SORTED[asv_key_all$Feature.ID %in% vent_plume] = "ASV appears among vent/plume (no BSW)"
vent <- as.character(unique(appears_vent$Feature.ID))
asv_key_all$SORTED[asv_key_all$Feature.ID %in% vent] = "ASV appears among vent (no BSW or plume)"
table(asv_key_all$SORTED)
## 
##     ASV appears among vent (no BSW or plume) 
##                                         4832 
##        ASV appears among vent/plume (no BSW) 
##                                         2528 
## ASV appears in BSW and throughout vent/plume 
##                                          457
#
asv_key_joined <- left_join(asv_key_all, asv_key_2)
# head(asv_key_joined)
asv_key_joined_filled <- data.frame(asv_key_joined, (coalesce(asv_key_joined$CATEGORY, asv_key_joined$SORTED)))
colnames(asv_key_joined_filled)[4] <- "category_final"
table(asv_key_joined_filled$category_final)
## 
##     ASV appears among vent (no BSW or plume) 
##                                          606 
##        ASV appears among vent/plume (no BSW) 
##                                          894 
## ASV appears in BSW and throughout vent/plume 
##                                          446 
##                   ASV present in all samples 
##                                           11 
##                                     BSW only 
##                                            7 
##  Shared between in situ vent and grazing exp 
##                                          179 
##                             Unique to sample 
##                                         6807
# Replace NAs in category_final
asv_key_final <- asv_key_joined_filled[, c("Feature.ID", "category_final")]
# str(asv_key_final)
asv_key_final$category_final <- as.character(asv_key_final$category_fina)
asv_key_final$category_final[is.na(asv_key_final$category_fina)] = "Other"

6.2 Combine with original ASV table

Print report on total ASV counts that fall into each category.

gr_sorted <- left_join(gr_counts_avg_wtax, asv_key_final) %>%
  filter(!(Sampletype == "Control"))

table(gr_sorted$category_final)
## 
##     ASV appears among vent (no BSW or plume) 
##                                         1926 
##        ASV appears among vent/plume (no BSW) 
##                                         5184 
## ASV appears in BSW and throughout vent/plume 
##                                         4358 
##                   ASV present in all samples 
##                                          268 
##                                     BSW only 
##                                           14 
##                                        Other 
##                                          205 
##  Shared between in situ vent and grazing exp 
##                                          546 
##                             Unique to sample 
##                                         6807
# head(gr_sorted)
# Stats
total <- sum(gr_sorted$COUNT_AVG); total
## [1] 1260878
gr_sorted_summary <- gr_sorted %>%
  group_by(category_final) %>%
  summarise(totalasv = n_distinct(Feature.ID), totalseq = sum(COUNT_AVG)) %>%
  mutate(Perc_seq = 100*(totalseq/total)) %>% 
  data.frame
#
# gr_sorted_summary
# sum(gr_sorted_summary$totalasv)

gr_sorted_summary_plots <- gr_sorted %>%
  group_by(category_final, LocationName, SAMPLEID) %>%
  summarise(totalasv = n_distinct(Feature.ID), totalseq = sum(COUNT_AVG)) %>%
  mutate(Perc_seq = 100*(totalseq/total)) %>% 
  data.frame

6.3 Generate table to report at taxa level

gr_stats_wtax <- gr_sorted %>% 
    group_by(Taxa, category_final) %>% 
    summarise(totalasv = n(), totalseq = sum(COUNT_AVG)) %>%
    ungroup() %>% 
    group_by(Taxa, category_final) %>% 
    summarise(totalasvs = sum(totalasv), 
            sumseqs = sum(totalseq)) %>%
    mutate(percentseq = sumseqs/sum(sumseqs)*100) %>% 
    pivot_wider(names_from = Taxa, names_glue = "{Taxa}_{.value}", values_from = c(totalasvs, sumseqs, percentseq)) %>% 
    data.frame
# head(gr_stats_wtax)
# head(gr_sorted)
# write_delim(gr_stats_wtax, path = "Distribution-ASVs-bytax.txt", delim = "\t")
# write_delim(gr_sorted_summary, path = "Distribution-ASVs.txt", delim = "\t")

6.4 Plot ASV & sequence abundance by distribution

Generate bar plot that summarized sequence and ASV abundance by distribution of ASV.

LocationNameOrder<-c("Shallow seawater", "Deep seawater", "Mt Edwards Plume", "Mt Edwards Vent", "Venti Latte Vent", "Candelabra Vent", "SirVentsAlot Vent")
gr_sorted_summary_plots$LOCATION_ORDER <- factor(gr_sorted_summary_plots$LocationName, levels = LocationNameOrder)

library(RColorBrewer)

category_order <- c("ASV present in all samples", "BSW only", "ASV appears in BSW and throughout vent/plume",
                   "ASV appears among vent/plume (no BSW)", "ASV appears among vent (no BSW or plume)",
                    "Shared between in situ vent and grazing exp", "Unique to sample", "Other")
category_color <- c("#003c30","#01665e", "#80cdc1",
                   "#c51b7d", "#de77ae", "#f1b6da",
                   "#fdb863", "#b35806")
gr_sorted_summary_plots$CATEGORY_ORDER <- factor(gr_sorted_summary_plots$category_final, levels = category_order)
names(category_color) <- category_order
totalseq <- ggplot(gr_sorted_summary_plots, aes(x=SAMPLEID, y=totalseq, fill=CATEGORY_ORDER)) +
  geom_bar(stat = "identity", color = "black", position = "fill") + 
#   scale_fill_brewer(palette = "Accent") +
  scale_fill_manual(values = category_color) +
  scale_y_continuous(expand = c(0, 0)) +
  facet_grid(. ~ LOCATION_ORDER, space = "free", scales = "free")+
  theme(legend.position = "right",
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank(),
        axis.text.x = element_text(angle=90, hjust=1, vjust=0.5, color = "black", face="bold"),
        axis.text.y = element_text(color="black", face="bold"),
        strip.text = element_text(size = 6, angle = 90, vjust = 0, hjust = 1),
        strip.background = element_blank(),
        legend.title = element_blank()) +
  labs(x="", y="")
totalasv <- ggplot(gr_sorted_summary_plots, aes(x=SAMPLEID, y= totalasv, fill=CATEGORY_ORDER)) +
  geom_bar(stat = "identity", color = "black", position = "fill") + 
#   scale_fill_brewer(palette = "Accent") +
  scale_fill_manual(values = category_color) +
  scale_y_continuous(expand = c(0, 0)) +
  facet_grid(. ~ LOCATION_ORDER, space = "free", scales = "free") +
  theme(legend.position = "right",
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank(),
        axis.text.x = element_text(angle=90, hjust=1, vjust=0.5, color = "black", face="bold"),
        axis.text.y = element_text(color="black", face="bold"),
        strip.text = element_text(size = 6, angle = 90, vjust = 0, hjust = 1),
        strip.background = element_blank(),
        legend.title = element_blank()) +
  labs(x="", y="")
# Supplementary figure
# options(repr.plot.width = 15, repr.plot.height = 10)
plot_grid(totalseq, totalasv,
          totalseq + geom_bar(stat = "identity", color = "black", position = "stack"), 
          totalasv + geom_bar(stat = "identity", color = "black", position = "stack"))

6.4.0.1 Re-plot with simplified categories

Sort categories into either “Resident” or “Cosmopolitan”

resident <- c("ASV appears among vent/plume (no BSW)", "ASV appears among vent (no BSW or plume)", "Shared between in situ vent and grazing exp")
cosmo <- c("ASV appears in BSW and throughout vent/plume", "ASV present in all samples")
gr_sorted$RES_COSMO = "Other"
gr_sorted$RES_COSMO[gr_sorted$category_final %in% resident] = "Resident"
gr_sorted$RES_COSMO[gr_sorted$category_final %in% cosmo] = "Cosmopolitan"
# head(gr_sorted)
tmp <- gr_sorted %>% 
  select(Feature.ID, RES_COSMO) %>% 
  distinct()
table(tmp$RES_COSMO) # ASV distribution
## 
## Cosmopolitan        Other     Resident 
##          457         6892         1679
total <- sum(gr_sorted$COUNT_AVG); total
## [1] 1260878
tmp2 <- gr_sorted %>% 
  select(RES_COSMO, COUNT_AVG) %>% 
  group_by(RES_COSMO) %>% 
  summarise(SUM = sum(COUNT_AVG)) %>% 
  mutate(Perc = 100*(SUM/total))
## `summarise()` ungrouping output (override with `.groups` argument)
head(tmp2)
## # A tibble: 3 x 3
##   RES_COSMO        SUM  Perc
##   <chr>          <dbl> <dbl>
## 1 Cosmopolitan 593854. 47.1 
## 2 Other        123331   9.78
## 3 Resident     543694. 43.1
# save(gr_sorted, asv_key_final, file = "data-input/GR-countinfo-withASVdistribution.RData")
# head(gr_sorted)
gr_sorted_bydist <- gr_sorted %>%
  unite("SAMPLE", c("LocationName", "Sampletype", "SAMPLEID"), sep = " ", remove = FALSE) %>%
  group_by(SAMPLE, Sampletype, LocationName, SAMPLEID, RES_COSMO) %>%
  summarise(totalasv = n(), totalseq = sum(COUNT_AVG)) %>%
  data.frame
# head(gr_sorted_bydist)
sample_order_all<-c("Shallow seawater in situ sterivex","Deep seawater in situ sterivex","Mt Edwards Plume in situ sterivex","Mt Edwards Plume Grazing T0","Mt Edwards Plume Grazing T24","Mt Edwards Plume Grazing T36","Mt Edwards Vent in situ SUPR","Mt Edwards Vent Grazing T0","Mt Edwards Vent Grazing T36","Venti Latte Vent in situ SUPR","Venti Latte Vent Grazing T0","Venti Latte Vent Grazing T36","Candelabra Vent in situ SUPR","Candelabra Vent Grazing T24","SirVentsAlot Vent in situ SUPR","SirVentsAlot Vent Grazing T24")
sample_name_all<-c("Shallow seawater","Deep seawater","Mt Edwards Plume","Mt Edwards Plume T0","Mt Edwards Plume T23","Mt Edwards Plume T35","Mt Edwards Vent","Mt Edwards Vent T0","Mt Edwards Vent  T36","Venti Latte Vent","Venti Latte Vent T0","Venti Latte Vent T29","Candelabra Vent","Candelabra Vent T26","SirVentsAlot Vent","SirVentsAlot Vent T24")
gr_sorted_bydist$SAMPLE_ORDER <- factor(gr_sorted_bydist$SAMPLE, levels = sample_order_all, labels = sample_name_all)
#
LocationNameOrder<-c("Shallow seawater", "Deep seawater", "Mt Edwards Plume", "Mt Edwards Vent", "Venti Latte Vent", "Candelabra Vent", "SirVentsAlot Vent")
gr_sorted_bydist$LOCATION_ORDER <- factor(gr_sorted_bydist$LocationName, levels = LocationNameOrder)
unique(gr_sorted_bydist$Sampletype)
## [1] Grazing in situ
## Levels: control Control Grazing in situ
category_order_simple <- c("Cosmopolitan", "Resident", "Other")
category_color_simple <- c("#7fcdbb", "#f768a1", "#fdae61")
gr_sorted_bydist$CATEGORY_ORDER <- factor(gr_sorted_bydist$RES_COSMO, levels = category_order_simple)
names(category_color_simple) <- category_order_simple
filled_seq <- ggplot(gr_sorted_bydist, aes(x = SAMPLE_ORDER, y = totalseq)) +
  geom_bar(stat = "identity", position = "stack", color = "black", 
           alpha = 0.6, aes(fill = CATEGORY_ORDER)) +
  scale_fill_manual(values = category_color_simple) +
  facet_grid(. ~ LOCATION_ORDER, space = "free", scales = "free") +
  scale_y_continuous(expand = c(0,0)) +
  theme(legend.position = "right",
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.line = element_line(),
        # panel.border = element_blank(),
        panel.background = element_blank(),
        axis.text.x = element_text(angle=90, hjust=1, vjust=0.5, color = "black", face="bold"),
        axis.text.y = element_text(color="black", face="bold"),
        strip.text = element_text(size = 5, face = "bold"),
        strip.background = element_blank(),
        legend.title = element_blank()) +
  labs(x="", y="Total sequences")
#
filled_asv <- ggplot(gr_sorted_bydist, aes(x = SAMPLE_ORDER, y= totalasv)) +
  geom_bar(stat = "identity", position = "stack", color = "black", 
           alpha = 0.6, aes(fill = CATEGORY_ORDER)) +
  scale_fill_manual(values = category_color_simple) +
  facet_grid(. ~ LOCATION_ORDER, space = "free", scales = "free") +
  scale_y_continuous(expand = c(0,0)) +
  theme(legend.position = "right",
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.line = element_line(),
        # panel.border = element_blank(),
        panel.background = element_blank(),
        axis.text.x = element_text(angle=90, hjust=1, vjust=0.5, color = "black", face="bold"),
        axis.text.y = element_text(color="black", face="bold"),
        strip.text = element_text(size = 5, face = "bold"),
        strip.background = element_blank(),
        legend.title = element_blank()) +
  labs(x="", y="Total ASVs")
legend<-get_legend(filled_seq)
# svg("dist-seq-asv.svg", h = 5, w = 15)
options(repr.plot.width = 15, repr.plot.height = 5)
plot_grid(filled_seq + theme(legend.position = "none"), 
          filled_asv + theme(legend.position = "none"),
          legend, ncol = 3, nrow = 1, rel_widths = c(1,1,0.5), labels = c("a", "b", ""))

# dev.off()

7 18S bar plots - high resolution

load("data-input/GordaRidge-ASVtable-avg-19-08-2020.RData",verbose=T)
## Loading objects:
##   gr_counts_filter
##   gr_counts_wtax
##   gr_counts_avg_wtax
load("data-input/GR-countinfo-withASVdistribution.RData", verbose=T)
## Loading objects:
##   gr_sorted
##   asv_key_final
# head(gr_sorted)

7.1 Re-curate taxonomic levels (higher resolution)

# Add Taxa2 level
expand_taxa2 <- function(df){
    sumseq <- sum(df$COUNT_AVG)
    # Sum asv totals
    df_tmp <- df %>% group_by(Feature.ID) %>% 
      summarise(SUM_ASV = sum(COUNT_AVG)) %>% 
      mutate(RelAbun = 100*(SUM_ASV/sumseq)) %>% 
      filter(RelAbun > 0.01) %>% 
      data.frame
    topASVs18s <- as.character(unique(df_tmp$Feature.ID)) #Select ASVs > 0.01% of data
    df$Taxa <- as.character(df$Taxa); df$Order <- as.character(df$Order)
    df$Class <- as.character(df$Class); df$Division <- as.character(df$Division)
    non_ciliate <- c("Alveolata-Syndiniales", "Alveolata-Dinoflagellates", "Alveolata-Other")
    order <- c("Alveolata-Syndiniales", "Alveolata-Dinoflagellates")
    class <- c("Alveolata-Ciliates", "Stramenopiles-Ochrophyta", "Stramenopiles-MAST", "Opisthokonta-Metazoa", "Opisthokonta-Fungi", "Opisthokonta-Other")
    df2 <- df %>% 
      mutate(Taxa2 = Taxa) %>% # Duplicate Taxa column
      mutate(Taxa2 = ifelse(Taxa %in% order, Order, Taxa2),
             Taxa2 = ifelse(Taxa %in% class, Class, Taxa2),
             Taxa2 = ifelse(Taxa %in% "Amoebozoa", Division, Taxa2),
             # Curate Ciliates
             Taxa2 = ifelse(Class == "Spirotrichea", paste(Class, Order, sep = "-"), Taxa2),
             Taxa2 = ifelse(grepl("Spirotrichea_", Taxa2), "Spirotrichea-Other", Taxa2),
             Taxa2 = ifelse(grepl("Spirotrichea-NA", Taxa2), "Spirotrichea-Other", Taxa2),
             Taxa2 = ifelse(grepl("Spirotrichea-Strombidiida_", Taxa2), "CONThreeP", Taxa2),
             Taxa2 = ifelse(grepl("CONTH_", Taxa2), "Spirotrichea-Strombidiida", Taxa2),
             # Curate Radiolaria
             Taxa2 = ifelse(Class == "Acantharea", Order, Taxa2),
             Taxa2 = ifelse(Class == "Polycystinea", Order, Taxa2),
             Taxa2 = ifelse(Taxa2 == "Rhizaria-Radiolaria-Other", "Rhizaria-Radiolaria", Taxa2),
             Taxa2 = ifelse(Taxa2 == "Rhizaria-Cercozoa-Other", "Rhizaria-Cercozoa", Taxa2),
             # Add hacrobia resolution
             Taxa2 = ifelse(Taxa2 == "Hacrobia-Other", Division, Taxa2),
             # Add Excavata resolution
             Taxa2 = ifelse(Taxa2 == "Excavata", Division, Taxa2),
             # Curate Stramenopiles
             # Taxa2 = ifelse(Taxa == "Stramenopiles-MAST", Taxa, Taxa2),
             Taxa2 = ifelse(grepl("MOCH-", Taxa2), "MOCH", Taxa2),
             # Curate other unknown - Move low abundance ASVs to "Other"
             Taxa2 = ifelse(grepl("_X", Taxa2), Taxa, Taxa2),
             Taxa2 = ifelse(is.na(Taxa2), Taxa, Taxa2),
             # Taxa2 = ifelse(!(Feature.ID %in% topASVs18s), paste(Taxa, "Other", sep = "-"), Taxa2),
             Taxa2 = ifelse(Taxa2 == "Unassigned-Eukaryote-Other", "Unassigned-Eukaryote", Taxa2),
             Taxa2 = ifelse(grepl("-Other-Other", Taxa2), Taxa, Taxa2)) %>% 
      mutate(Broad_Taxa = Taxa) %>%
      mutate(Broad_Taxa = ifelse(Broad_Taxa %in% non_ciliate, "Alveolata", Broad_Taxa),
             Broad_Taxa = ifelse(grepl("Rhizaria", Broad_Taxa), "Rhizaria", Broad_Taxa),
             Broad_Taxa = ifelse(grepl("Stramenopiles", Broad_Taxa), "Stramenopiles", Broad_Taxa),
             Broad_Taxa = ifelse(grepl("Archaeplastida", Broad_Taxa), "Archaeplastida", Broad_Taxa),
             Broad_Taxa = ifelse(grepl("Hacrobia", Broad_Taxa), "Hacrobia", Broad_Taxa),
             Broad_Taxa = ifelse(grepl("Opisthokonta", Broad_Taxa), "Opisthokonta", Broad_Taxa)) %>% 
  data.frame
  return(df2)
    }
# Apply to ASV table
gr_counts_avg_wtax2 <- expand_taxa2(gr_counts_avg_wtax)
## `summarise()` ungrouping output (override with `.groups` argument)
unique(gr_counts_avg_wtax2$Broad_Taxa)
##  [1] "Rhizaria"             "Stramenopiles"        "Opisthokonta"        
##  [4] "Alveolata-Ciliates"   "Alveolata"            "Unassigned-Eukaryote"
##  [7] "Hacrobia"             "Archaeplastida"       "Excavata"            
## [10] "Amoebozoa"
## Check Ciliate output
# unique(gr_counts_avg_wtax2 %>% filter(Taxa == "Alveolata-Ciliates") %>% select(Taxa2) %>% distinct())
## Check non-ciliate output
# View(unique(gr_counts_avg_wtax2 %>% filter(!(Taxa == "Alveolata-Ciliates")) %>% select(Taxa, Taxa2) %>% distinct()))

7.2 Include ASV distribution

# Add categories & set up for plotting function
gr_tax_res <- gr_counts_avg_wtax2 %>% 
    left_join(asv_key_final) %>% 
    unite(SAMPLE, LocationName, Sampletype, SAMPLEID, sep = "-", remove = FALSE) %>% 
    data.frame
## Joining, by = "Feature.ID"
#
resident <- c("ASV appears among vent/plume (no BSW)", "ASV appears among vent (no BSW or plume)", "Shared between in situ vent and grazing exp")
cosmo <- c("ASV appears in BSW and throughout vent/plume", "ASV present in all samples")
gr_tax_res$RES_COSMO = "Other"
gr_tax_res$RES_COSMO[gr_tax_res$category_final %in% resident] = "Resident"
gr_tax_res$RES_COSMO[gr_tax_res$category_final %in% cosmo] = "Cosmopolitan"

7.3 Plot function

sample_order_all<-c("Shallow seawater-in situ-sterivex","Deep seawater-in situ-sterivex","Mt Edwards Plume-in situ-sterivex","Mt Edwards Plume-Grazing-T0","Mt Edwards Plume-Grazing-T24","Mt Edwards Plume-Grazing-T36","Mt Edwards Vent-in situ-SUPR","Mt Edwards Vent-Grazing-T0","Mt Edwards Vent-Grazing-T36","Venti Latte Vent-in situ-SUPR","Venti Latte Vent-Grazing-T0","Venti Latte Vent-Grazing-T36","Candelabra Vent-in situ-SUPR","Candelabra Vent-Grazing-T24","SirVentsAlot Vent-in situ-SUPR","SirVentsAlot Vent-Grazing-T24")
sample_name_all<-c("Shallow seawater","Deep seawater","Mt Edwards Plume","Mt Edwards Plume T0","Mt Edwards Plume T23","Mt Edwards Plume T35","Mt Edwards Vent","Mt Edwards Vent T0","Mt Edwards Vent  T36","Venti Latte Vent","Venti Latte Vent T0","Venti Latte Vent T29","Candelabra Vent","Candelabra Vent T26","SirVentsAlot Vent","SirVentsAlot Vent T24")

prepdf_tax_dist <- function(df){ 
    df2 <- df %>% 
#     filter(category_final %in% category) %>% 
    # average across replicates
    group_by(Feature.ID, RES_COSMO, SAMPLE, SAMPLEID, Sampletype, LocationName, Broad_Taxa, Taxon_updated, Taxa, Taxa2) %>%
    summarise(COUNT_AVG2 = mean(COUNT_AVG)) %>%
    ungroup() %>% 
    # sum by like taxa
    group_by(RES_COSMO, SAMPLE, SAMPLEID, Sampletype, LocationName, Broad_Taxa, Taxa, Taxa2) %>% 
    summarise(SUM = sum(COUNT_AVG2)) %>% 
    data.frame
    df2$SAMPLENAME<-factor(df2$SAMPLE, levels = sample_order_all, labels = sample_name_all)
    df2$SAMPLEID_ORDER <- factor(df2$SAMPLEID, levels = c("sterivex", "SUPR", "T0", "T24", "T36"))
    df2$LOCATION_ORDER <- factor(df2$LocationName, levels=c("Shallow seawater", "Deep seawater", "Mt Edwards Plume", "Mt Edwards Vent", "Venti Latte Vent", "Candelabra Vent", "SirVentsAlot Vent"))
    return(df2)
}
gr_tax_res_toplot <- prepdf_tax_dist(gr_tax_res)
# head(gr_tax_res_toplot[1:2,])
# View(gr_tax_res_toplot %>% select(Taxa, Taxa2) %>% distinct())
gr_tax_res_richness <- gr_tax_res %>% 
    # Average across replicates
    group_by(Feature.ID, RES_COSMO, SAMPLE, SAMPLEID, Sampletype, LocationName, Broad_Taxa, Taxon_updated, Taxa, Taxa2) %>%
    summarise(COUNT_AVG2 = mean(COUNT_AVG)) %>%
    ungroup() %>% 
    # Get richness for each taxonomic group
    group_by(RES_COSMO, Broad_Taxa, Taxa, Taxa2) %>% 
    summarise(RICHNESS = n_distinct(Feature.ID)) %>% 
    data.frame
## `summarise()` regrouping output by 'Feature.ID', 'RES_COSMO', 'SAMPLE', 'SAMPLEID', 'Sampletype', 'LocationName', 'Broad_Taxa', 'Taxon_updated', 'Taxa' (override with `.groups` argument)
## `summarise()` regrouping output by 'RES_COSMO', 'Broad_Taxa', 'Taxa' (override with `.groups` argument)
# Factor
ciliate_order <- c("Heterotrichea","Oligohymenophorea","Litostomatea","Karyorelictea","Nassophorea","Phyllopharyngea","Plagiopylea","CONThreeP","Spirotrichea-Choreotrichida","Spirotrichea-Euplotia","Spirotrichea-Hypotrichia","Spirotrichea-Strombidiida","Spirotrichea-Tintinnida","Spirotrichea-Other","Alveolata-Ciliates")
gr_tax_res_toplot$CILIATE_ORDER <- factor(gr_tax_res_toplot$Taxa2, levels = ciliate_order)
CILIATE_COLOR <- c("#ffffcc","#d9f0a3","#addd8e","#78c679","#31a354","#006837","#fde0dd","#fcc5c0","#fa9fb5","#f768a1","#dd3497","#ae017e","#7a0177","#49006a","#bdbdbd")

names(CILIATE_COLOR)<-ciliate_order
# head(gr_tax_res_toplot)
ciliate_plot <- gr_tax_res_toplot %>% 
    filter(Taxa %in% "Alveolata-Ciliates") %>% 
    filter(!(RES_COSMO == "Other")) %>% 
    ggplot(aes(x = SAMPLENAME, y = SUM, fill = CILIATE_ORDER)) +
    geom_bar(stat = "identity", position = "fill", color = "black") +
    scale_fill_manual(values = CILIATE_COLOR) +
    scale_y_continuous(expand = c(0,0))+
    theme(legend.position = "right",
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank(),
          panel.background = element_blank(),
          axis.text.x = element_text(angle=90, hjust=1, vjust=0.5, color = "black", face="bold", size = 12),
          axis.text.y = element_text(color="black", face="bold", size = 12),
          strip.text.x = element_blank(),
          strip.background = element_blank(),
          legend.title = element_blank()) +
    labs(x="", y="Relative abundance")+
    facet_grid(RES_COSMO~LOCATION_ORDER, space = "free", scales = "free")
ciliate_plot

7.4 Plot non-ciliate distribution

Repeat high-resolution look at protistan biogeography, but without ciliate groups.

# Heterotrophs - non-ciliate
tax_order <- c("Prorocentrales","Gymnodiniales","Peridiniales","Gonyaulacales","Torodiniales","Suessiales","Noctilucales","Alveolata-Dinoflagellates","Dino-Group-I","Dino-Group-II","Dino-Group-IV","Dino-Group-III","Dino-Group-V","Alveolata-Syndiniales","Alveolata-Other","Rhizaria-Cercozoa","Rhizaria-Radiolaria-Acantharea","Rhizaria-Radiolaria-Polycystinea","Rhizaria-Radiolaria","Rhizaria-Other","Stramenopiles-MAST","Bacillariophyta","Dictyochophyceae","Chrysophyceae","MOCH","Bolidophyceae","Pelagophyceae","Synurophyceae","Stramenopiles-Ochrophyta","Stramenopiles-Other","Hacrobia-Cryptophyta","Hacrobia-Haptophyta","Centroheliozoa","Telonemia","Picozoa","Katablepharidophyta","Hacrobia-Other","Amoebozoa","Discoba","Metamonada","Archaeplastida-Chlorophyta","Archaeplastida-Other","Unassigned-Eukaryote","Basidiomycota","Chytridiomycota","Opisthokonta-Fungi","Ascomycota","Microsporidiomycota","Cnidaria","Craniata","Mollusca","Nemertea","Opisthokonta-Metazoa","Annelida","Arthropoda","Rotifera","Urochordata","Nematoda","Ctenophora","Echinodermata","Platyhelminthes","Porifera","Gastrotricha","Opisthokonta-Other","Choanoflagellatea","Ichthyosporea","Filasterea")
color_order <- c("#7f0000","#b30000","#d7301f","#ef6548","#fc8d59","#fdbb84","#fdd49e","#fee8c8","#deebf7","#9ecae1","#6baed6","#2171b5","#08519c","#08306b","#ffffff","#800026","#bd0026","#e31a1c","#fc4e2a","#fd8d3c","#feb24c","#fed976","#ffeda0","#ffffcc","#fec44f","#fe9929","#ec7014","#cc4c02","#993404","#662506","#e5f5e0","#a1d99b","#74c476","#41ab5d","#238b45","#006d2c","#00441b","#edf8b1","#7fcdbb","#1d91c0","#4292c6","#08519c","#f0f0f0","#4d004b","#810f7c","#88419d","#8c6bb1","#8c96c6","#9ebcda","#bfd3e6","#e0ecf4","#f7fcfd","#67001f","#980043","#ce1256","#e7298a","#df65b0","#c994c7","#d4b9da","#e7e1ef","#014636","#016c59","#02818a","#3690c0","#67a9cf","#a6bddb","#d0d1e6")
gr_tax_res_toplot$TAXORDER <- factor(gr_tax_res_toplot$Taxa2, levels = tax_order)
names(color_order) <- tax_order

hets_plot <- ggplot(gr_tax_res_toplot, aes(x = SAMPLENAME, y = SUM, fill = TAXORDER)) +
    geom_bar(stat = "identity", position = "fill", color = "black") +
    scale_fill_manual(values = color_order) +
    scale_y_continuous(expand = c(0,0))+
    theme(legend.position = "right",
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank(),
          panel.background = element_blank(),
          axis.text.x = element_text(angle=90, hjust=1, vjust=0.5, color = "black", face="bold", size = 12),
          axis.text.y = element_text(color="black", face="bold", size = 12),
          strip.text.x = element_blank(),
          strip.background = element_blank(),
          legend.title = element_blank()) +
    labs(x="", y="Relative abundance")+
    # guides(fill = guide_legend(ncol = 1)) +
    facet_grid(RES_COSMO~LOCATION_ORDER, space = "free", scales = "free")
metaz <- c("Opisthokonta-Fungi", "Opisthokonta-Other", "Opisthokonta-Metazoa")
# hets_plot %+% subset(gr_tax_res_toplot, !(RES_COSMO == "Other") & (!(Taxa %in% metaz)) & (!(Taxa == "Alveolata-Ciliates")))
# svg("Supplementary-metazoa-plot.svg", h = 8, w = 10)
hets_plot %+% subset(gr_tax_res_toplot, !(RES_COSMO == "Other") & (Taxa %in% metaz))

# dev.off()

7.5 Generate full panel plot

# options(repr.plot.width = 18, repr.plot.height = 8)

# svg("tax-res-withdist-GR.svg", w = 18, h = 8)
# ciliate_key <- get_legend(ciliate_plot)
# other_key <- get_legend(hets_plot %+% subset(gr_tax_res_toplot, !(RES_COSMO == "Other") & (!(Taxa %in% metaz)) & (!(Taxa == "Alveolata-Ciliates"))))
# plot_grid(ciliate_plot + theme(legend.position = "none"),
#           ciliate_key,
#          hets_plot %+% subset(gr_tax_res_toplot, !(RES_COSMO == "Other") & (!(Taxa %in% metaz)) & (!(Taxa == "Alveolata-Ciliates"))) + theme(legend.position = "none"),
#          other_key,
#          ncol = 4, labels = c("a","", "b", ""), rel_widths = c(1, 0.2, 1, 0.2)) 
# dev.off()

7.6 Tile plot with all taxonomic groups

head(gr_tax_res_toplot)
##      RES_COSMO                      SAMPLE SAMPLEID Sampletype    LocationName
## 1 Cosmopolitan Candelabra Vent-Grazing-T24      T24    Grazing Candelabra Vent
## 2 Cosmopolitan Candelabra Vent-Grazing-T24      T24    Grazing Candelabra Vent
## 3 Cosmopolitan Candelabra Vent-Grazing-T24      T24    Grazing Candelabra Vent
## 4 Cosmopolitan Candelabra Vent-Grazing-T24      T24    Grazing Candelabra Vent
## 5 Cosmopolitan Candelabra Vent-Grazing-T24      T24    Grazing Candelabra Vent
## 6 Cosmopolitan Candelabra Vent-Grazing-T24      T24    Grazing Candelabra Vent
##   Broad_Taxa                      Taxa                     Taxa2  SUM
## 1  Alveolata Alveolata-Dinoflagellates Alveolata-Dinoflagellates 1880
## 2  Alveolata Alveolata-Dinoflagellates             Gymnodiniales  294
## 3  Alveolata Alveolata-Dinoflagellates              Peridiniales   36
## 4  Alveolata Alveolata-Dinoflagellates            Prorocentrales   37
## 5  Alveolata     Alveolata-Syndiniales              Dino-Group-I  315
## 6  Alveolata     Alveolata-Syndiniales             Dino-Group-II  199
##            SAMPLENAME SAMPLEID_ORDER  LOCATION_ORDER CILIATE_ORDER
## 1 Candelabra Vent T26            T24 Candelabra Vent          <NA>
## 2 Candelabra Vent T26            T24 Candelabra Vent          <NA>
## 3 Candelabra Vent T26            T24 Candelabra Vent          <NA>
## 4 Candelabra Vent T26            T24 Candelabra Vent          <NA>
## 5 Candelabra Vent T26            T24 Candelabra Vent          <NA>
## 6 Candelabra Vent T26            T24 Candelabra Vent          <NA>
##                    TAXORDER
## 1 Alveolata-Dinoflagellates
## 2             Gymnodiniales
## 3              Peridiniales
## 4            Prorocentrales
## 5              Dino-Group-I
## 6             Dino-Group-II
# View(unique(gr_tax_res_toplot$Broad_Taxa))
gr_relAbun_toheat <- gr_tax_res_toplot %>% 
    # filter(Taxa %in% "Alveolata-Ciliates") %>% 
    filter(!(RES_COSMO == "Other")) %>% 
    #Determine relative abundance within samples
    group_by(Broad_Taxa, SAMPLENAME) %>% 
    mutate(SUMTOTAL = sum(SUM),
           RelAbun = 100*(SUM/SUMTOTAL)) %>% 
    data.frame

# Re-factor
tax2_order_all <- c("Heterotrichea","Oligohymenophorea","Litostomatea","Karyorelictea","Nassophorea","Phyllopharyngea","Plagiopylea","CONThreeP","Spirotrichea-Choreotrichida","Spirotrichea-Euplotia","Spirotrichea-Hypotrichia","Spirotrichea-Strombidiida","Spirotrichea-Tintinnida","Spirotrichea-Other","Alveolata-Ciliates","Prorocentrales","Gymnodiniales","Peridiniales","Gonyaulacales","Torodiniales","Suessiales","Noctilucales","Alveolata-Dinoflagellates","Dino-Group-I","Dino-Group-II","Dino-Group-IV","Dino-Group-III","Dino-Group-V","Alveolata-Syndiniales","Alveolata-Other","Rhizaria-Cercozoa","Rhizaria-Radiolaria-Acantharea","Rhizaria-Radiolaria-Polycystinea","Rhizaria-Radiolaria","Rhizaria-Other","Stramenopiles-MAST","Bacillariophyta","Dictyochophyceae","Chrysophyceae","MOCH","Bolidophyceae","Pelagophyceae","Synurophyceae","Stramenopiles-Ochrophyta","Stramenopiles-Other","Hacrobia-Cryptophyta","Hacrobia-Haptophyta","Centroheliozoa","Telonemia","Picozoa","Katablepharidophyta","Hacrobia-Other","Amoebozoa","Discoba","Metamonada","Archaeplastida-Chlorophyta","Archaeplastida-Other","Unassigned-Eukaryote","Basidiomycota","Chytridiomycota","Opisthokonta-Fungi","Ascomycota","Microsporidiomycota","Cnidaria","Craniata","Mollusca","Nemertea","Opisthokonta-Metazoa","Annelida","Arthropoda","Rotifera","Urochordata","Nematoda","Ctenophora","Echinodermata","Platyhelminthes","Porifera","Gastrotricha","Opisthokonta-Other","Choanoflagellatea","Ichthyosporea","Filasterea")
gr_relAbun_toheat$TAXA2_ORDER <- factor(gr_relAbun_toheat$Taxa2, levels = tax2_order_all)

gr_relAbun_toheat$Broad_ORDER <- factor(gr_relAbun_toheat$Broad_Taxa, levels = c("Alveolata-Ciliates", "Alveolata", "Rhizaria", "Stramenopiles", "Hacrobia", "Amoebozoa", "Excavata", "Archaeplastida", "Opisthokonta", "Unassigned-Eukaryote"))
broad_color <- c("#ae017e","#bd0026","#ec7014","#238b45","#016c59","#08519c","#54278f","#810f7c")
    # Make geom tile plot
tile_tax <- ggplot(gr_relAbun_toheat, aes(x = SAMPLENAME, alpha = RelAbun, fill = Broad_ORDER, y = Taxa2)) +
    geom_tile(color = "white") +
    scale_fill_manual(values = broad_color) +
    theme(legend.position = "right",
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank(),
          panel.background = element_blank(),
          axis.text.x = element_text(angle=90, hjust=1, vjust=0.5, color = "black", size = 8),
          axis.text.y = element_text(color="black", size = 8),
          strip.background = element_blank(),
          legend.title = element_blank()) +
    labs(x="", y="")+
    facet_grid(Broad_ORDER~RES_COSMO, space = "free", scales = "free")

metaz <- c("Opisthokonta-Fungi", "Opisthokonta-Other", "Opisthokonta-Metazoa")


tile_tax %+% subset(gr_relAbun_toheat, !(Taxa %in% metaz | Taxa == "Unassigned-Eukaryote"))

gr_tax_res_richness$Broad_ORDER <- factor(gr_tax_res_richness$Broad_Taxa, levels = c("Alveolata-Ciliates", "Alveolata", "Rhizaria", "Stramenopiles", "Hacrobia", "Amoebozoa", "Excavata", "Archaeplastida", "Opisthokonta", "Unassigned-Eukaryote"))
# head(gr_tax_res_richness)
# bubble plot richness
richness_df <- gr_tax_res_richness %>% 
  filter(!(RES_COSMO == "Other")) %>% 
  filter(!(Taxa %in% metaz | Taxa == "Unassigned-Eukaryote")) %>% 
  data.frame
#
richness_plot <- ggplot(richness_df, aes(x = RES_COSMO, y = Taxa2)) +
    geom_point(aes(size = RICHNESS)) +
    scale_size_continuous(range = c(0.2, 3)) +
    facet_grid(Broad_ORDER~RES_COSMO, space = "free", scales = "free") +
    theme(legend.position = "right",
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank(),
          panel.background = element_blank(),
          axis.text.x = element_text(angle = 90, size = 8),
          axis.text.y = element_text(size = 8),
          strip.text = element_blank(),
          strip.background = element_blank(),
          legend.title = element_blank()) +
    labs(x="", y="")
tile <- get_legend(tile_tax %+% subset(gr_relAbun_toheat, !(Taxa %in% metaz | Taxa == "Unassigned-Eukaryote")))
rich <- get_legend(richness_plot)

# svg("tile_bubble_plot.svg", w = 22, h = 10)
plot_grid(tile_tax %+% subset(gr_relAbun_toheat, !(Taxa %in% metaz | Taxa == "Unassigned-Eukaryote")) + theme(legend.position = "none"),
          richness_plot + theme(legend.position = "none"),
          tile, rich, ncol = 4,
          axis = "b")

# dev.off()

7.7 Supplementary plots

tax_order <- c("Prorocentrales","Gymnodiniales","Peridiniales","Gonyaulacales","Torodiniales","Suessiales","Noctilucales","Alveolata-Dinoflagellates","Dino-Group-I","Dino-Group-II","Dino-Group-IV","Dino-Group-III","Dino-Group-V","Alveolata-Syndiniales","Alveolata-Other","Rhizaria-Cercozoa","Rhizaria-Radiolaria-Acantharea","Rhizaria-Radiolaria-Polycystinea","Rhizaria-Radiolaria","Rhizaria-Other","Stramenopiles-MAST","Bacillariophyta","Dictyochophyceae","Chrysophyceae","MOCH","Bolidophyceae","Pelagophyceae","Synurophyceae","Stramenopiles-Ochrophyta","Stramenopiles-Other","Hacrobia-Cryptophyta","Hacrobia-Haptophyta","Centroheliozoa","Telonemia","Picozoa","Katablepharidophyta","Hacrobia-Other","Amoebozoa","Discoba","Metamonada","Archaeplastida-Chlorophyta","Archaeplastida-Other","Unassigned-Eukaryote","Basidiomycota","Chytridiomycota","Opisthokonta-Fungi","Ascomycota","Microsporidiomycota","Cnidaria","Craniata","Mollusca","Nemertea","Opisthokonta-Metazoa","Annelida","Arthropoda","Rotifera","Urochordata","Nematoda","Ctenophora","Echinodermata","Platyhelminthes","Porifera","Gastrotricha","Opisthokonta-Other","Choanoflagellatea","Ichthyosporea","Filasterea")
color_order <- c("#7f0000","#b30000","#d7301f","#ef6548","#fc8d59","#fdbb84","#fdd49e","#fee8c8","#deebf7","#9ecae1","#6baed6","#2171b5","#08519c","#08306b","#ffffff","#800026","#bd0026","#e31a1c","#fc4e2a","#fd8d3c","#feb24c","#fed976","#ffeda0","#ffffcc","#fec44f","#fe9929","#ec7014","#cc4c02","#993404","#662506","#e5f5e0","#a1d99b","#74c476","#41ab5d","#238b45","#006d2c","#00441b","#edf8b1","#7fcdbb","#1d91c0","#4292c6","#08519c","#f0f0f0","#4d004b","#810f7c","#88419d","#8c6bb1","#8c96c6","#9ebcda","#bfd3e6","#e0ecf4","#f7fcfd","#67001f","#980043","#ce1256","#e7298a","#df65b0","#c994c7","#d4b9da","#e7e1ef","#014636","#016c59","#02818a","#3690c0","#67a9cf","#a6bddb","#d0d1e6")
metaz <- c("Opisthokonta-Metazoa", "Opisthokonta-Fungi", "Opisthokonta-Other")

supp_plot <- ggplot(gr_tax_res_toplot, aes(x = SAMPLENAME, y = SUM, fill = Taxa2)) +
    geom_bar(stat = "identity", position = "fill", color = "black") +
    scale_fill_manual(values = color_order) +
    scale_y_continuous(expand = c(0,0))+
    theme(legend.position = "right",
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank(),
          panel.background = element_blank(),
          axis.text.x = element_text(angle=90, hjust=1, vjust=0.5, color = "black", face="bold", size = 12),
          axis.text.y = element_text(color="black", face="bold", size = 12),
          strip.text.x = element_blank(),
          strip.background = element_blank(),
          legend.title = element_blank()) +
    labs(x="", y="Relative abundance")+
    facet_grid(RES_COSMO~LOCATION_ORDER, space = "free", scales = "free")
supp_plot %+% subset(gr_tax_res_toplot, !(RES_COSMO == "Other") & Taxa %in% metaz)

# Look at whole taxonomic distribution
head(gr_tax_res)
##                         Feature.ID                            SAMPLE SAMPLEID
## 1 0009645516609bda2246e1955ff9ec1d Shallow seawater-in situ-sterivex sterivex
## 2 0030ad8ce44f257c42daf3673bf92197 Shallow seawater-in situ-sterivex sterivex
## 3 0030ad8ce44f257c42daf3673bf92197     Venti Latte Vent-in situ-SUPR     SUPR
## 4 0030ad8ce44f257c42daf3673bf92197      Candelabra Vent-in situ-SUPR     SUPR
## 5 0030ad8ce44f257c42daf3673bf92197     SirVentsAlot Vent-Grazing-T24      T24
## 6 0038478be7fb4f097ce93a5e9341af2a    Deep seawater-in situ-sterivex sterivex
##   Sampletype LOCATION_SPECIFIC      LocationName
## 1    in situ            BSW081  Shallow seawater
## 2    in situ            BSW081  Shallow seawater
## 3    in situ           Vent040  Venti Latte Vent
## 4    in situ           Vent088   Candelabra Vent
## 5    Grazing           Vent110 SirVentsAlot Vent
## 6    in situ            BSW056     Deep seawater
##                                                                                                                          Taxon_updated
## 1 Eukaryota;Rhizaria;Radiolaria;Acantharea;Acantharea-Group-II;Acantharea-Group-II_X;Acantharea-Group-II_XX;Acantharea-Group-II_XX_sp.
## 2                                                  Eukaryota;Stramenopiles;Opalozoa;MAST-3;MAST-3J;MAST-3J_X;MAST-3J_XX;MAST-3J_XX_sp.
## 3                                                  Eukaryota;Stramenopiles;Opalozoa;MAST-3;MAST-3J;MAST-3J_X;MAST-3J_XX;MAST-3J_XX_sp.
## 4                                                  Eukaryota;Stramenopiles;Opalozoa;MAST-3;MAST-3J;MAST-3J_X;MAST-3J_XX;MAST-3J_XX_sp.
## 5                                                  Eukaryota;Stramenopiles;Opalozoa;MAST-3;MAST-3J;MAST-3J_X;MAST-3J_XX;MAST-3J_XX_sp.
## 6                                                Eukaryota;Opisthokonta;Metazoa;Cnidaria;Cnidaria_X;Hydrozoa;Aglaura;Aglaura_hemistoma
##     Kingdom    Supergroup   Division      Class               Order
## 1 Eukaryota      Rhizaria Radiolaria Acantharea Acantharea-Group-II
## 2 Eukaryota Stramenopiles   Opalozoa     MAST-3             MAST-3J
## 3 Eukaryota Stramenopiles   Opalozoa     MAST-3             MAST-3J
## 4 Eukaryota Stramenopiles   Opalozoa     MAST-3             MAST-3J
## 5 Eukaryota Stramenopiles   Opalozoa     MAST-3             MAST-3J
## 6 Eukaryota  Opisthokonta    Metazoa   Cnidaria          Cnidaria_X
##                  Family                  Genus                    Species
## 1 Acantharea-Group-II_X Acantharea-Group-II_XX Acantharea-Group-II_XX_sp.
## 2             MAST-3J_X             MAST-3J_XX             MAST-3J_XX_sp.
## 3             MAST-3J_X             MAST-3J_XX             MAST-3J_XX_sp.
## 4             MAST-3J_X             MAST-3J_XX             MAST-3J_XX_sp.
## 5             MAST-3J_X             MAST-3J_XX             MAST-3J_XX_sp.
## 6              Hydrozoa                Aglaura          Aglaura_hemistoma
##                   Taxa COUNT_AVG               Taxa2    Broad_Taxa
## 1  Rhizaria-Radiolaria        80 Acantharea-Group-II      Rhizaria
## 2   Stramenopiles-MAST        36              MAST-3 Stramenopiles
## 3   Stramenopiles-MAST        12              MAST-3 Stramenopiles
## 4   Stramenopiles-MAST        34              MAST-3 Stramenopiles
## 5   Stramenopiles-MAST        15              MAST-3 Stramenopiles
## 6 Opisthokonta-Metazoa        21            Cnidaria  Opisthokonta
##                                 category_final    RES_COSMO
## 1                             Unique to sample        Other
## 2 ASV appears in BSW and throughout vent/plume Cosmopolitan
## 3 ASV appears in BSW and throughout vent/plume Cosmopolitan
## 4 ASV appears in BSW and throughout vent/plume Cosmopolitan
## 5 ASV appears in BSW and throughout vent/plume Cosmopolitan
## 6 ASV appears in BSW and throughout vent/plume Cosmopolitan
sirvents <- gr_tax_res %>% 
  filter(LocationName == "SirVentsAlot Vent") %>% 
    # sum by like taxa
  group_by(LOCATION_SPECIFIC, RES_COSMO, SAMPLE, SAMPLEID, Sampletype, LocationName, Broad_Taxa, Taxa, Taxa2) %>% 
  summarise(SUM = sum(COUNT_AVG)) %>% 
  filter(!(RES_COSMO == "Other")) %>% 
  #Determine relative abundance within samples
  group_by(Broad_Taxa, LOCATION_SPECIFIC, SAMPLE) %>% 
  mutate(SUMTOTAL = sum(SUM),
         RelAbun = 100*(SUM/SUMTOTAL)) %>% 
  unite(SAMPLE_REPS, LOCATION_SPECIFIC, SAMPLE, sep = " ", remove = FALSE) %>% 
  data.frame
## `summarise()` regrouping output by 'LOCATION_SPECIFIC', 'RES_COSMO', 'SAMPLE', 'SAMPLEID', 'Sampletype', 'LocationName', 'Broad_Taxa', 'Taxa' (override with `.groups` argument)
head(sirvents)
##                              SAMPLE_REPS LOCATION_SPECIFIC    RES_COSMO
## 1 Vent105 SirVentsAlot Vent-in situ-SUPR           Vent105 Cosmopolitan
## 2 Vent105 SirVentsAlot Vent-in situ-SUPR           Vent105 Cosmopolitan
## 3 Vent105 SirVentsAlot Vent-in situ-SUPR           Vent105 Cosmopolitan
## 4 Vent105 SirVentsAlot Vent-in situ-SUPR           Vent105 Cosmopolitan
## 5 Vent105 SirVentsAlot Vent-in situ-SUPR           Vent105 Cosmopolitan
## 6 Vent105 SirVentsAlot Vent-in situ-SUPR           Vent105 Cosmopolitan
##                           SAMPLE SAMPLEID Sampletype      LocationName
## 1 SirVentsAlot Vent-in situ-SUPR     SUPR    in situ SirVentsAlot Vent
## 2 SirVentsAlot Vent-in situ-SUPR     SUPR    in situ SirVentsAlot Vent
## 3 SirVentsAlot Vent-in situ-SUPR     SUPR    in situ SirVentsAlot Vent
## 4 SirVentsAlot Vent-in situ-SUPR     SUPR    in situ SirVentsAlot Vent
## 5 SirVentsAlot Vent-in situ-SUPR     SUPR    in situ SirVentsAlot Vent
## 6 SirVentsAlot Vent-in situ-SUPR     SUPR    in situ SirVentsAlot Vent
##   Broad_Taxa                      Taxa                     Taxa2  SUM SUMTOTAL
## 1  Alveolata Alveolata-Dinoflagellates Alveolata-Dinoflagellates 2882     6857
## 2  Alveolata Alveolata-Dinoflagellates             Gymnodiniales 1686     6857
## 3  Alveolata Alveolata-Dinoflagellates              Peridiniales  171     6857
## 4  Alveolata Alveolata-Dinoflagellates            Prorocentrales   99     6857
## 5  Alveolata Alveolata-Dinoflagellates              Torodiniales  134     6857
## 6  Alveolata     Alveolata-Syndiniales              Dino-Group-I  354     6857
##     RelAbun
## 1 42.030042
## 2 24.588012
## 3  2.493802
## 4  1.443780
## 5  1.954207
## 6  5.162608
# Factoring
sirvents$SAMPLENAME<-factor(sirvents$SAMPLE, levels = sample_order_all, labels = sample_name_all)
sirvents$SAMPLEID_ORDER <- factor(sirvents$SAMPLEID, levels = c("sterivex", "SUPR", "T0", "T24", "T36"))
# Re-factor
tax2_order_all <- c("Heterotrichea","Oligohymenophorea","Litostomatea","Karyorelictea","Nassophorea","Phyllopharyngea","Plagiopylea","CONThreeP","Spirotrichea-Choreotrichida","Spirotrichea-Euplotia","Spirotrichea-Hypotrichia","Spirotrichea-Strombidiida","Spirotrichea-Tintinnida","Spirotrichea-Other","Alveolata-Ciliates","Prorocentrales","Gymnodiniales","Peridiniales","Gonyaulacales","Torodiniales","Suessiales","Noctilucales","Alveolata-Dinoflagellates","Dino-Group-I","Dino-Group-II","Dino-Group-IV","Dino-Group-III","Dino-Group-V","Alveolata-Syndiniales","Alveolata-Other","Rhizaria-Cercozoa","Rhizaria-Radiolaria-Acantharea","Rhizaria-Radiolaria-Polycystinea","Rhizaria-Radiolaria","Rhizaria-Other","Stramenopiles-MAST","Bacillariophyta","Dictyochophyceae","Chrysophyceae","MOCH","Bolidophyceae","Pelagophyceae","Synurophyceae","Stramenopiles-Ochrophyta","Stramenopiles-Other","Hacrobia-Cryptophyta","Hacrobia-Haptophyta","Centroheliozoa","Telonemia","Picozoa","Katablepharidophyta","Hacrobia-Other","Amoebozoa","Discoba","Metamonada","Archaeplastida-Chlorophyta","Archaeplastida-Other","Unassigned-Eukaryote","Basidiomycota","Chytridiomycota","Opisthokonta-Fungi","Ascomycota","Microsporidiomycota","Cnidaria","Craniata","Mollusca","Nemertea","Opisthokonta-Metazoa","Annelida","Arthropoda","Rotifera","Urochordata","Nematoda","Ctenophora","Echinodermata","Platyhelminthes","Porifera","Gastrotricha","Opisthokonta-Other","Choanoflagellatea","Ichthyosporea","Filasterea")
sirvents$TAXA2_ORDER <- factor(sirvents$Taxa2, levels = tax2_order_all)

sirvents$Broad_ORDER <- factor(sirvents$Broad_Taxa, levels = c("Alveolata-Ciliates", "Alveolata", "Rhizaria", "Stramenopiles", "Hacrobia", "Amoebozoa", "Excavata", "Archaeplastida", "Opisthokonta", "Unassigned-Eukaryote"))
broad_color <- c("#ae017e","#bd0026","#ec7014","#238b45","#016c59","#08519c","#54278f","#810f7c")
sirvents_plot <- ggplot(sirvents, aes(x = SAMPLE_REPS, alpha = RelAbun, fill = Broad_ORDER, y = Taxa2)) +
    geom_tile(color = "white") +
    scale_fill_manual(values = broad_color) +
    theme(legend.position = "right",
          panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(),
          panel.border = element_blank(),
          panel.background = element_blank(),
          axis.text.x = element_text(angle=90, hjust=1, vjust=0.5, color = "black", size = 8),
          axis.text.y = element_text(color="black", size = 8),
          strip.background = element_blank(),
          legend.title = element_blank()) +
    labs(x="", y="")+
    facet_grid(Broad_ORDER~RES_COSMO, space = "free", scales = "free")

metaz <- c("Opisthokonta-Fungi", "Opisthokonta-Other", "Opisthokonta-Metazoa")


sirvents_plot %+% subset(sirvents, !(Taxa == "Unassigned-Eukaryote"))

7.8 ID most abundant ASVs

# What are the most abundant ASVs in each of the taxa2 categories?
gr_topASV_taxa2 <- gr_tax_res %>% 
    select(Feature.ID, RES_COSMO, Taxa, Taxa2, Taxon_updated, COUNT_AVG) %>%
    group_by(Feature.ID, RES_COSMO, Taxa, Taxa2, Taxon_updated) %>% 
    summarise(Total = sum(COUNT_AVG)) %>% 
    ungroup() %>% 
    group_by(Taxa, Taxa2) %>%
    arrange(Taxa2, desc(Total)) %>% 
    top_n(10, Total) %>% 
    data.frame
# gr_topASV_taxa2
write_delim(gr_topASV_taxa2, path = "supptable-topASVs-taxa2.txt", delim = "\t")
# save(gr_tax_res, file = "GR-18S-ASV-list.RData")

8 Correlation analysis: 18S-16S

Use these ASVs downstream to explore hypotheses with correlation results

  • H1: What are the more abundant cosmo and resident ASVs among ciliates - esp named grouped above? Hypothesis: the composition of 16S ASVs that significantly correlated with the 18S ASVs is different between the cosmo and resident 18S ASVs. This would suggest that prey composition is attracting these taxa.
  • H2: What 18S ASVs are significantly correlated to the 16S ASVs considered to be “vent-specific”? Hypothesis: no clear pattern will emerge (with respect to taxonomic group or 18S prevalent among vent site - meaning, resident or cosmo). This would mean that protists are found throughout the deep sea, but are specifically enriched at the vent site. So both cosmopolitan and resident protists may be associated with the 16S vent-specific signature. This is similar to Murdock and Juniper, where there was not a consistent pattern in those that were significant correlated to vent-endemic 16S.

8.1 Prepare 16S and 18S data for correlation analysis:

Format input 18S and 16S data, save for correlation analysis.

# load 18S data
load("data-input/GR-countinfo-withASVdistribution.RData", verbose=T)
## Loading objects:
##   gr_sorted
##   asv_key_final
# head(gr_sorted)
# load 16S data
bac_wtax <- read.delim("bac-wtaxassigned-20-08-2020.txt")
# head(bac_wtax)
# Sort and filter eukaryote ASVs to consider:
sumseq <- sum(gr_sorted$COUNT_AVG)
metaz <- c("Opisthokonta-Fungi", "Opisthokonta-Other", "Opisthokonta-Metazoa")

euk_data_interact <- gr_sorted %>%
  filter(Sampletype == "in situ") %>% #select only in situ samples
  select(Feature.ID, Taxon_updated, COUNT_AVG, LocationName) %>% 
  group_by(Feature.ID, Taxon_updated, LocationName) %>% 
  summarise(COUNT_TOTAL = sum(COUNT_AVG)) %>% 
  ungroup() %>% 
  # Calculate relative abundance
  mutate(RelAbun = 100*(COUNT_TOTAL/sumseq)) %>% 
  # Remove ASVs ahead of network analysis
  group_by(Feature.ID) %>% 
  filter(RelAbun > 0.001) %>%
  mutate(sample_appear = n_distinct(LocationName)) %>% #Calculate how many times an ASV appears
  filter(sample_appear > 4) %>% #ASV must appear in at least 3 samples
  filter(COUNT_TOTAL >= 10) %>% #ASV must have at least 10 sequences
  filter(!Taxon_updated == "Unassigned-Eukaryote") %>% 
  filter(!Taxon_updated %in% metaz) %>% 
  add_column(domain = "euk") %>%
  unite(FEATURE, domain, Feature.ID, sep = "_", remove = TRUE) %>% 
  select(FEATURE, LocationName, Taxon_EUK = Taxon_updated, COUNT = COUNT_TOTAL) %>% 
  data.frame
## `summarise()` regrouping output by 'Feature.ID', 'Taxon_updated' (override with `.groups` argument)
length(unique(gr_sorted$Feature.ID)); length(unique(euk_data_interact$FEATURE))
## [1] 9028
## [1] 207
# View(euk_data_interact)
sumseq <- sum(bac_wtax$AVG_count)

bac_data_interact <- bac_wtax %>% 
  filter(!(Tax_update == "Other")) %>% #Remove "other"
  group_by(Feature.ID, Tax_update, LocationName) %>% 
  summarise(COUNT_TOTAL = sum(AVG_count)) %>% 
  ungroup() %>% 
  add_column(domain = "prok") %>% 
  # Calculate relative abundance
  mutate(RelAbun = 100*(COUNT_TOTAL/sumseq)) %>% 
  # Remove ASVs ahead of network analysis
  group_by(Feature.ID) %>% 
  filter(RelAbun > 0.001) %>%
  mutate(sample_appear = n_distinct(LocationName)) %>% #Calculate how many times an ASV appears
  filter(sample_appear > 3) %>% #ASV must appear in at least 3 samples
  filter(COUNT_TOTAL >= 10) %>% #ASV must have at least 10 sequences
  unite(FEATURE, domain, Feature.ID, sep = "_", remove = TRUE) %>% 
  select(FEATURE, LocationName, Taxon_BAC = Tax_update, COUNT = COUNT_TOTAL) %>% 
  data.frame
## `summarise()` regrouping output by 'Feature.ID', 'Tax_update' (override with `.groups` argument)
length(unique(bac_wtax$Feature.ID)); length(unique(bac_data_interact$FEATURE))
## [1] 3650
## [1] 158
# save(euk_data_interact, bac_data_interact, file = "Filtered-correlation-R-objects-20-08-2020.RData")

8.1.1 Import as phyloseq objects

euk_df <- euk_data_interact %>% 
  pivot_wider(names_from = LocationName, values_from = COUNT, values_fill = 0) %>% 
  select(order(colnames(.))) %>% 
  data.frame
# head(euk_df)

euk_asv <- as.matrix(select(euk_df, -Taxon_EUK) %>% column_to_rownames(var = "FEATURE"))
euk_tax <- as.matrix(select(euk_df, FEATURE, Taxon_EUK) %>% column_to_rownames(var = "FEATURE"))
# head(bac_asv); head(bac_tax)
row.names(euk_asv) <- row.names(euk_tax)

# Phyloseq import
euk_asv_table <- otu_table(euk_asv, taxa_are_rows = TRUE)
euk_tax_table <- tax_table(euk_tax)
euk_phy <- phyloseq(euk_asv_table, euk_tax_table)
euk_phy
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 207 taxa and 7 samples ]
## tax_table()   Taxonomy Table:    [ 207 taxa by 1 taxonomic ranks ]
bac_df <- bac_data_interact %>% 
  pivot_wider(names_from = LocationName, values_from = COUNT, values_fill = 0) %>% 
  select(order(colnames(.))) %>%
  data.frame

bac_asv <- as.matrix(select(bac_df, -Taxon_BAC) %>% column_to_rownames(var = "FEATURE"))
bac_tax <- as.matrix(select(bac_df, FEATURE, Taxon_BAC) %>% column_to_rownames(var = "FEATURE"))
# head(bac_asv); head(bac_tax)
row.names(bac_asv) <- row.names(bac_tax)

# Phyloseq import
bac_asv_table <- otu_table(bac_asv, taxa_are_rows = TRUE)
bac_tax_table <- tax_table(bac_tax)
bac_phy <- phyloseq(bac_asv_table, bac_tax_table)
bac_phy
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 158 taxa and 7 samples ]
## tax_table()   Taxonomy Table:    [ 158 taxa by 1 taxonomic ranks ]
# Repeat phyloseq objects, but make small ones for testing spieceasi
bac_df_tmp <- bac_data_interact %>% 
  pivot_wider(names_from = LocationName, values_from = COUNT, values_fill = 0) %>% 
  sample_n(50) %>% 
  select(order(colnames(.))) %>%
  data.frame

bac_asv <- as.matrix(select(bac_df_tmp, -Taxon_BAC) %>% column_to_rownames(var = "FEATURE"))
bac_tax <- as.matrix(select(bac_df_tmp, FEATURE, Taxon_BAC) %>% column_to_rownames(var = "FEATURE"))
# head(bac_asv); head(bac_tax)
row.names(bac_asv) <- row.names(bac_tax)

# Phyloseq import
bac_asv_table <- otu_table(bac_asv, taxa_are_rows = TRUE)
bac_tax_table <- tax_table(bac_tax)
bac_phy_tmp <- phyloseq(bac_asv_table, bac_tax_table)

euk_df_tmp <- euk_data_interact %>% 
  pivot_wider(names_from = LocationName, values_from = COUNT, values_fill = 0) %>% 
  sample_n(50) %>%
  select(order(colnames(.))) %>% 
  data.frame
# head(euk_df)

euk_asv <- as.matrix(select(euk_df_tmp, -Taxon_EUK) %>% column_to_rownames(var = "FEATURE"))
euk_tax <- as.matrix(select(euk_df_tmp, FEATURE, Taxon_EUK) %>% column_to_rownames(var = "FEATURE"))
# head(bac_asv); head(bac_tax)
row.names(euk_asv) <- row.names(euk_tax)

# Phyloseq import
euk_asv_table <- otu_table(euk_asv, taxa_are_rows = TRUE)
euk_tax_table <- tax_table(euk_tax)
euk_phy_tmp <- phyloseq(euk_asv_table, euk_tax_table)
euk_phy_tmp
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 50 taxa and 7 samples ]
## tax_table()   Taxonomy Table:    [ 50 taxa by 1 taxonomic ranks ]
# save phyloseq objects to run SpiecEasi in another script
save(bac_phy, euk_phy, file = "phyloseq-18s-16s.RData")
# save(bac_phy_tmp, euk_phy_tmp, file = "phyloseq-18s-16s-TMP.RData")

8.2 SpiecEasi

8.2.1 Run SpiecEasi

See slurm script right now, requires HPC.

8.2.2 Process SpiecEasi output

Transform into dataframes to look at relationship of pairs

load("data-input/gr-spieceasi-output-20-08-2020.RData", verbose = T)
## Loading objects:
##   df_adj
##   df_beta
##   se_GR
# head(df_adj)
# colnames(df_adj)
# head(euk_data_interact)
# head(bac_data_interact)
head(gr_tax_res)
##                         Feature.ID                            SAMPLE SAMPLEID
## 1 0009645516609bda2246e1955ff9ec1d Shallow seawater-in situ-sterivex sterivex
## 2 0030ad8ce44f257c42daf3673bf92197 Shallow seawater-in situ-sterivex sterivex
## 3 0030ad8ce44f257c42daf3673bf92197     Venti Latte Vent-in situ-SUPR     SUPR
## 4 0030ad8ce44f257c42daf3673bf92197      Candelabra Vent-in situ-SUPR     SUPR
## 5 0030ad8ce44f257c42daf3673bf92197     SirVentsAlot Vent-Grazing-T24      T24
## 6 0038478be7fb4f097ce93a5e9341af2a    Deep seawater-in situ-sterivex sterivex
##   Sampletype LOCATION_SPECIFIC      LocationName
## 1    in situ            BSW081  Shallow seawater
## 2    in situ            BSW081  Shallow seawater
## 3    in situ           Vent040  Venti Latte Vent
## 4    in situ           Vent088   Candelabra Vent
## 5    Grazing           Vent110 SirVentsAlot Vent
## 6    in situ            BSW056     Deep seawater
##                                                                                                                          Taxon_updated
## 1 Eukaryota;Rhizaria;Radiolaria;Acantharea;Acantharea-Group-II;Acantharea-Group-II_X;Acantharea-Group-II_XX;Acantharea-Group-II_XX_sp.
## 2                                                  Eukaryota;Stramenopiles;Opalozoa;MAST-3;MAST-3J;MAST-3J_X;MAST-3J_XX;MAST-3J_XX_sp.
## 3                                                  Eukaryota;Stramenopiles;Opalozoa;MAST-3;MAST-3J;MAST-3J_X;MAST-3J_XX;MAST-3J_XX_sp.
## 4                                                  Eukaryota;Stramenopiles;Opalozoa;MAST-3;MAST-3J;MAST-3J_X;MAST-3J_XX;MAST-3J_XX_sp.
## 5                                                  Eukaryota;Stramenopiles;Opalozoa;MAST-3;MAST-3J;MAST-3J_X;MAST-3J_XX;MAST-3J_XX_sp.
## 6                                                Eukaryota;Opisthokonta;Metazoa;Cnidaria;Cnidaria_X;Hydrozoa;Aglaura;Aglaura_hemistoma
##     Kingdom    Supergroup   Division      Class               Order
## 1 Eukaryota      Rhizaria Radiolaria Acantharea Acantharea-Group-II
## 2 Eukaryota Stramenopiles   Opalozoa     MAST-3             MAST-3J
## 3 Eukaryota Stramenopiles   Opalozoa     MAST-3             MAST-3J
## 4 Eukaryota Stramenopiles   Opalozoa     MAST-3             MAST-3J
## 5 Eukaryota Stramenopiles   Opalozoa     MAST-3             MAST-3J
## 6 Eukaryota  Opisthokonta    Metazoa   Cnidaria          Cnidaria_X
##                  Family                  Genus                    Species
## 1 Acantharea-Group-II_X Acantharea-Group-II_XX Acantharea-Group-II_XX_sp.
## 2             MAST-3J_X             MAST-3J_XX             MAST-3J_XX_sp.
## 3             MAST-3J_X             MAST-3J_XX             MAST-3J_XX_sp.
## 4             MAST-3J_X             MAST-3J_XX             MAST-3J_XX_sp.
## 5             MAST-3J_X             MAST-3J_XX             MAST-3J_XX_sp.
## 6              Hydrozoa                Aglaura          Aglaura_hemistoma
##                   Taxa COUNT_AVG               Taxa2    Broad_Taxa
## 1  Rhizaria-Radiolaria        80 Acantharea-Group-II      Rhizaria
## 2   Stramenopiles-MAST        36              MAST-3 Stramenopiles
## 3   Stramenopiles-MAST        12              MAST-3 Stramenopiles
## 4   Stramenopiles-MAST        34              MAST-3 Stramenopiles
## 5   Stramenopiles-MAST        15              MAST-3 Stramenopiles
## 6 Opisthokonta-Metazoa        21            Cnidaria  Opisthokonta
##                                 category_final    RES_COSMO
## 1                             Unique to sample        Other
## 2 ASV appears in BSW and throughout vent/plume Cosmopolitan
## 3 ASV appears in BSW and throughout vent/plume Cosmopolitan
## 4 ASV appears in BSW and throughout vent/plume Cosmopolitan
## 5 ASV appears in BSW and throughout vent/plume Cosmopolitan
## 6 ASV appears in BSW and throughout vent/plume Cosmopolitan
countbac <- read.delim("data-input/CountTable-wtax-16s-plus3-2020-06-23.txt")
# colnames(countbac)
bac_data_interact_fulltax <- bac_data_interact %>%
  select(FEATURE, TAX_SHORT = Taxon_BAC) %>%
  separate(FEATURE, c("domain", "Feature.ID"), sep = "_", remove = FALSE) %>% 
  left_join(select(countbac, Feature.ID, TAX_FULL = Taxon)) %>% 
  select(FEATURE, TAX_FULL, TAX_SHORT) %>% 
  data.frame
## Joining, by = "Feature.ID"
# head(bac_data_interact_fulltax)
# head(gr_tax_res)

# Make taxonomy key
tax_key_se <- euk_data_interact %>% 
  select(FEATURE, TAX_FULL = Taxon_EUK) %>% 
  separate(FEATURE, c("domain", "Feature.ID"), sep = "_", remove = FALSE) %>%
  left_join(select(gr_tax_res, Feature.ID, TAX_SHORT = Taxa, EUK_2 = Taxa2, EUK_DIST = RES_COSMO, Broad_Taxa)) %>% 
  select(FEATURE, TAX_FULL, TAX_SHORT, EUK_2, EUK_DIST, Broad_Taxa) %>% 
  bind_rows(bac_data_interact_fulltax) %>% 
  distinct() %>% 
  data.frame
## Joining, by = "Feature.ID"
# View(tax_key_se)
reformat_spieceasi <- function(df_in){
  interaction <- c("PROK-EUK", "EUK-PROK")
  df_in %>% 
    rownames_to_column(var = "SIDEA") %>%
    pivot_longer(cols = starts_with(c("prok", "euk")), names_to = "SIDEB") %>%
    mutate(domain_a = case_when(
      grepl("prok", SIDEA) ~ "PROK",
      grepl("euk", SIDEA) ~ "EUK"),
    domain_b = case_when(
      grepl("prok", SIDEB) ~ "PROK",
      grepl("euk", SIDEB) ~ "EUK")) %>%
    mutate(COMBO = paste(domain_a, domain_b, sep = "-")) %>%
    mutate(COMBO_TYPE = case_when(
      COMBO %in% interaction ~ "cross",
      TRUE ~ "same"),
      SIG_ID = paste(SIDEA, SIDEB, sep ="-")) %>%
    select(-starts_with("domain")) %>% 
    left_join(select(tax_key_se, TAX_SIDEA = TAX_FULL, everything()), by = c("SIDEA" = "FEATURE")) %>% 
    left_join(select(tax_key_se, TAX_SIDEB = TAX_FULL, everything()), by = c("SIDEB" = "FEATURE"), suffix = c(".A", ".B")) %>%
    data.frame
}

df_adj_long <- reformat_spieceasi(df_adj)
df_beta_long <- reformat_spieceasi(df_beta)

8.2.3 Evaluate statistical parameters determine weight threshold

Evaluate the range of weighted outputs from SpiecEasi. Determine if a threshold can be set.

# Get list of these parameters
# Adjacency matrix - binary, where 1 = significant interaction
# Boot strapped pvalue, showing weight of each correlation
adj_sig <- df_adj_long %>% 
  filter(value == 1) %>% 
  filter(COMBO_TYPE == "cross") %>% 
  select(everything(), Adjacency = value) %>% 
  left_join(select(df_beta_long, SIG_ID, Weight = value)) %>% 
  data.frame
## Joining, by = "SIG_ID"
colnames(adj_sig)
##  [1] "SIDEA"        "SIDEB"        "Adjacency"    "COMBO"        "COMBO_TYPE"  
##  [6] "SIG_ID"       "TAX_SIDEA"    "TAX_SHORT.A"  "EUK_2.A"      "EUK_DIST.A"  
## [11] "Broad_Taxa.A" "TAX_SIDEB"    "TAX_SHORT.B"  "EUK_2.B"      "EUK_DIST.B"  
## [16] "Broad_Taxa.B" "Weight"
dim(adj_sig); dim(df_adj_long) # 1004 significant interactions that are cross-domain
## [1] 1004   17
## [1] 133225     16
# Isolate the unique interactions and make a table for export
complete_list <- adj_sig %>% 
  filter(COMBO == "EUK-PROK") %>%
  separate(SIDEA, c("domain", "ASV_18S"), sep = "_") %>% 
  separate(SIDEB, c("domain2", "ASV_16S"), sep = "_") %>% 
  select(-starts_with("domain"), -COMBO, -COMBO_TYPE, -SIG_ID, TAX_18S = TAX_SIDEA, TAX_16S = TAX_SIDEB) %>% 
  data.frame
# View(complete_list)
# write_delim(complete_list, path = "Complete-cross-domain-interactions.txt", delim = "\t")

8.2.4 Compare relationships at the taxonomic group level

Of the interactions between 18S- and 16S-derived data, we are interested in capturing the putative predator prey relationships

tax_sum_interact <- adj_sig %>% 
  filter(COMBO == "EUK-PROK") %>%
  separate(SIDEA, c("domain", "ASV_18S"), sep = "_") %>% 
  separate(SIDEB, c("domain2", "ASV_16S"), sep = "_") %>% 
  select(-starts_with("domain"), -COMBO, -COMBO_TYPE, -SIG_ID, -Adjacency) %>% 
  unite(INTERACTION, TAX_SHORT.A, TAX_SHORT.B, sep = "_", remove = FALSE) %>% 
  add_column(COUNT = 1) %>% 
  data.frame
# View(tax_sum_interact)
length(unique(tax_sum_interact$INTERACTION)) #Total significant interactions between euk and bac
## [1] 146
# table(tax_sum_interact$INTERACTION)
# head(tax_sum_interact)
unique(tax_sum_interact$TAX_SHORT.A)
##  [1] "Rhizaria-Radiolaria"        "Alveolata-Syndiniales"     
##  [3] "Stramenopiles-Other"        "Alveolata-Dinoflagellates" 
##  [5] "Stramenopiles-MAST"         "Hacrobia-Haptophyta"       
##  [7] "Hacrobia-Other"             "Stramenopiles-Ochrophyta"  
##  [9] "Alveolata-Ciliates"         "Opisthokonta-Other"        
## [11] "Archaeplastida-Chlorophyta" "Opisthokonta-Metazoa"      
## [13] "Rhizaria-Cercozoa"          "Hacrobia-Cryptophyta"      
## [15] "Unassigned-Eukaryote"
# Table of significant interactions
summary_sig_interactions <- tax_sum_interact %>% 
  select(ASV_18S, ASV_16S, TAX_SHORT.A, COUNT) %>% 
  distinct() %>% 
  group_by(TAX_SHORT.A) %>% 
  summarise(COUNT_18S_ASVs = n_distinct(ASV_18S)) %>% 
  data.frame
## `summarise()` ungrouping output (override with `.groups` argument)
# View(summary_sig_interactions) # Included in Table 2

8.2.5 Plot distribution of interactions

library(ggalluvial)
# head(tax_sum_interact)
putative_prey <- tax_sum_interact %>% 
  filter(!(Broad_Taxa.A == "Unassigned-Eukaryote")) %>% 
  group_by(Broad_Taxa.A, TAX_SHORT.A, TAX_SHORT.B) %>% 
  summarise(count_sum = sum(COUNT)) %>% 
  data.frame
## `summarise()` regrouping output by 'Broad_Taxa.A', 'TAX_SHORT.A' (override with `.groups` argument)
putative_prey$BROAD_TAXA_ORDER <- factor(putative_prey$Broad_Taxa.A, levels = c("Alveolata-Ciliates", "Alveolata", "Rhizaria", "Stramenopiles", "Hacrobia", "Amoebozoa", "Excavata", "Archaeplastida", "Opisthokonta", "Unassigned-Eukaryote"))
broad_color <- c("#ae017e","#bd0026","#ec7014","#238b45","#016c59","#08519c","#54278f","#810f7c")

# head(putative_prey)
# svg("18s-16s-alluvial-interaction.svg", h = 7, w = 9)
ggplot(putative_prey,
       aes(axis1 = TAX_SHORT.A, axis2 = TAX_SHORT.B, y = count_sum)) +
  scale_x_discrete(limits = c("TAX_SHORT.A", "TAX_SHORT.B"), expand = c(.2, .05)) +
  # xlab("Demographic") +
  geom_alluvium(aes(fill = Broad_Taxa.A), alpha = 0.5) +
  scale_fill_manual(values = broad_color) +
  geom_stratum(size = 0.5) +
  geom_text(stat = "stratum", aes(label = after_stat(stratum)), size = 3) +
  theme_minimal() +
  theme(axis.text.x = element_blank(),
        legend.title = element_blank()) +
  labs(y = "Total Interactions", x = "", title = "18S-16S interactions")

# dev.off()

8.2.6 Plot distribution of interactions with respect to protistan classification

library(ggalluvial)
# head(tax_sum_interact)
putative_prey_dist <- tax_sum_interact %>% 
  filter(!(Broad_Taxa.A == "Unassigned-Eukaryote")) %>% 
  group_by(Broad_Taxa.A, TAX_SHORT.A, TAX_SHORT.B, EUK_DIST.A) %>% 
  summarise(count_sum = sum(COUNT)) %>% 
  data.frame
## `summarise()` regrouping output by 'Broad_Taxa.A', 'TAX_SHORT.A', 'TAX_SHORT.B' (override with `.groups` argument)
broad_order <- c("Alveolata-Ciliates", "Alveolata", "Rhizaria", "Stramenopiles", "Hacrobia", "Amoebozoa", "Excavata", "Archaeplastida", "Opisthokonta", "Unassigned-Eukaryote")
putative_prey_dist$BROAD_TAXA_ORDER <- factor(putative_prey_dist$Broad_Taxa.A, levels = broad_order)
broad_color <- c("#ae017e","#bd0026","#ec7014","#238b45","#016c59","#08519c","#54278f","#810f7c", "black", "grey")
names(broad_color) <- broad_order
# svg("18s-16s-alluvial-interaction-COSMO.svg", h = 7, w = 9)
putative_prey_dist %>% 
  filter(EUK_DIST.A == "Cosmopolitan") %>%
  filter(!(BROAD_TAXA_ORDER == "Opisthokonta")) %>% 
  filter(!(BROAD_TAXA_ORDER == "Unassigned-Eukaryote")) %>% 
  ggplot(aes(axis1 = TAX_SHORT.A, axis2 = TAX_SHORT.B, y = count_sum)) +
    scale_x_discrete(limits = c("TAX_SHORT.A", "TAX_SHORT.B"), expand = c(.2, .05)) +
    # xlab("Demographic") +
    geom_alluvium(aes(fill = BROAD_TAXA_ORDER), alpha = 0.5) +
    scale_fill_manual(values = broad_color) +
    geom_stratum(size = 0.5) +
    geom_text(stat = "stratum", aes(label = after_stat(stratum)), size = 3) +
    theme_minimal() +
    theme(axis.text.x = element_blank(),
          legend.title = element_blank()) +
    labs(y = "Total Interactions", x = "", title = "18S-16S interactions")

# dev.off()
# svg("18s-16s-alluvial-interaction-RESIDENT.svg", h = 7, w = 9)
putative_prey_dist %>% 
  filter(EUK_DIST.A == "Resident") %>% 
  ggplot(aes(axis1 = TAX_SHORT.A, axis2 = TAX_SHORT.B, y = count_sum)) +
    scale_x_discrete(limits = c("TAX_SHORT.A", "TAX_SHORT.B"), expand = c(.2, .05)) +
    # xlab("Demographic") +
    geom_alluvium(aes(fill = BROAD_TAXA_ORDER), alpha = 0.5) +
    scale_fill_manual(values = broad_color) +
    geom_stratum(size = 0.5) +
    geom_text(stat = "stratum", aes(label = after_stat(stratum)), size = 3) +
    theme_minimal() +
    theme(axis.text.x = element_blank(),
          legend.title = element_blank()) +
    labs(y = "Total Interactions", x = "", title = "18S-16S interactions")

# dev.off()

8.2.7 Create 3rd axis for alluvial plot to show location

# head(bac_data_interact) # Full taxonomy
# head(euk_data_interact)
euk_tmp <- euk_data_interact %>% 
  separate(FEATURE, c("domain", "Feature.ID"), sep = "_", remove = TRUE) %>%
  select(Feature.ID, Loc.A = LocationName)
bac_tmp <- bac_data_interact %>% 
  separate(FEATURE, c("domain", "Feature.ID"), sep = "_", remove = TRUE) %>%
  select(Feature.ID, Loc.B = LocationName)

p_p_wloc <- tax_sum_interact %>% 
  type.convert(as.is = TRUE) %>%
  left_join(euk_tmp, by = c("ASV_18S" = "Feature.ID")) %>% 
  left_join(bac_tmp, by = c("ASV_16S" = "Feature.ID")) %>% 
  mutate(Loc_Interact = case_when(
    as.character(Loc.A) == as.character(Loc.B) ~ "SAMELOCATION")) %>% 
  filter(Loc_Interact == "SAMELOCATION") %>% 
  filter(!(Broad_Taxa.A == "Unassigned-Eukaryote")) %>% 
  group_by(Broad_Taxa.A, TAX_SHORT.A, TAX_SHORT.B, Loc.A) %>% 
  summarise(count_sum = sum(COUNT)) %>% 
  data.frame
## `summarise()` regrouping output by 'Broad_Taxa.A', 'TAX_SHORT.A', 'TAX_SHORT.B' (override with `.groups` argument)
# head(p_p_wloc)
dim(putative_prey); dim(p_p_wloc)
## [1] 144   5
## [1] 600   5
p_p_wloc$BROAD_TAXA_ORDER <- factor(p_p_wloc$Broad_Taxa.A, levels = c("Alveolata-Ciliates", "Alveolata", "Rhizaria", "Stramenopiles", "Hacrobia", "Amoebozoa", "Excavata", "Archaeplastida", "Opisthokonta", "Unassigned-Eukaryote"))
broad_color <- c("#ae017e","#bd0026","#ec7014","#238b45","#016c59","#08519c","#54278f","#810f7c")
# svg("Interactions-wlocation-Protistsinmiddle.svg", h = 10, w = 10)
# svg("Interactions-wlocation.svg", h = 10, w = 10)
# ggplot(p_p_wloc, aes(axis1 = TAX_SHORT.A, axis2 = TAX_SHORT.B, axis3 = Loc.A, y = count_sum)) +
#   # scale_x_discrete(limits = c("Loc.A", "TAX_SHORT.A", "TAX_SHORT.B"), expand = c(.2, .05)) +
#   scale_x_discrete(limits = c("TAX_SHORT.A", "TAX_SHORT.B", "Loc.A"), expand = c(.2, .05)) +
#   geom_alluvium(aes(fill = Broad_Taxa.A), alpha = 0.7) +
#   scale_fill_manual(values = broad_color) +
#   geom_stratum(size = 0.5) +
#   geom_text(stat = "stratum", aes(label = after_stat(stratum)), size = 2) +
#   theme_minimal() +
#   theme(axis.text.x = element_blank(),
#         legend.title = element_blank()) +
#   labs(y = "Total Interactions", x = "", title = "18S-16S interactions")
# dev.off()

9 Session Info

sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Mojave 10.14.6
## 
## Matrix products: default
## BLAS/LAPACK: /Users/sarahhu/anaconda3/envs/r_3.6.0/lib/R/lib/libRblas.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggalluvial_0.12.1    ggrepel_0.8.2        ape_5.3             
##  [4] RColorBrewer_1.1-2   cluster_2.1.0        compositions_1.40-5 
##  [7] bayesm_3.1-4         robustbase_0.93-6    tensorA_0.36.1      
## [10] ade4_1.7-15          vegan_2.5-6          lattice_0.20-41     
## [13] permute_0.9-5        plotly_4.9.2.1       decontam_1.6.0      
## [16] phyloseq_1.30.0      patchwork_1.0.0.9000 cowplot_1.0.0       
## [19] reshape2_1.4.4       forcats_0.5.0        stringr_1.4.0       
## [22] dplyr_1.0.0          purrr_0.3.4          readr_1.3.1         
## [25] tidyr_1.1.0          tibble_3.0.1         ggplot2_3.3.1       
## [28] tidyverse_1.3.0     
## 
## loaded via a namespace (and not attached):
##  [1] VGAM_1.1-3          colorspace_1.4-1    ellipsis_0.3.1     
##  [4] XVector_0.26.0      fs_1.4.1            rstudioapi_0.11    
##  [7] farver_2.0.3        fansi_0.4.1         lubridate_1.7.8    
## [10] xml2_1.3.2          codetools_0.2-16    splines_3.6.1      
## [13] knitr_1.28          SpiecEasi_1.1.0     jsonlite_1.6.1     
## [16] broom_0.7.0         dbplyr_1.4.4        compiler_3.6.1     
## [19] httr_1.4.1          backports_1.1.7     assertthat_0.2.1   
## [22] Matrix_1.2-18       lazyeval_0.2.2      cli_2.0.2          
## [25] htmltools_0.4.0     tools_3.6.1         igraph_1.2.5       
## [28] gtable_0.3.0        glue_1.4.1          Rcpp_1.0.5         
## [31] Biobase_2.46.0      cellranger_1.1.0    vctrs_0.3.0        
## [34] Biostrings_2.54.0   multtest_2.42.0     nlme_3.1-148       
## [37] iterators_1.0.12    crosstalk_1.1.0.1   xfun_0.14          
## [40] rvest_0.3.5         lifecycle_0.2.0     DEoptimR_1.0-8     
## [43] zlibbioc_1.32.0     MASS_7.3-51.6       scales_1.1.1       
## [46] hms_0.5.3           parallel_3.6.1      biomformat_1.14.0  
## [49] huge_1.3.4.1        rhdf5_2.30.1        yaml_2.2.1         
## [52] stringi_1.4.6       S4Vectors_0.24.4    foreach_1.5.0      
## [55] BiocGenerics_0.32.0 shape_1.4.4         rlang_0.4.6        
## [58] pkgconfig_2.0.3     evaluate_0.14       Rhdf5lib_1.8.0     
## [61] htmlwidgets_1.5.1   labeling_0.3        tidyselect_1.1.0   
## [64] plyr_1.8.6          magrittr_1.5        R6_2.4.1           
## [67] IRanges_2.20.2      generics_0.0.2      DBI_1.1.0          
## [70] pillar_1.4.4        haven_2.3.1         withr_2.2.0        
## [73] mgcv_1.8-31         survival_3.1-12     pulsar_0.3.7       
## [76] modelr_0.1.8        crayon_1.3.4        utf8_1.1.4         
## [79] rmarkdown_2.2       grid_3.6.1          readxl_1.3.1       
## [82] data.table_1.12.8   blob_1.2.1          reprex_0.3.0       
## [85] digest_0.6.25       glmnet_4.0-2        stats4_3.6.1       
## [88] munsell_0.5.0       viridisLite_0.3.0